[Xen-devel] [Resend Fix PATCH] Qemu/Xen: Fix early freeing MSIX MMIO memory region
MSIX MMIO memory region is added to pt device's obj as property. When pt device is unplugged, all properties will be deleted and memory region's obj is needed at that point(refer object_finalize_child_property()). But current code frees MSIX MMIO memory region in the xen_pt_msix_delete() before deleting pt device's properties, this will cause segment fault. Reproduce the bug via hotplugging device frequently. This patch is to fix the issue via moving MSIX MMIO memory region into struct XenPCIPassthroughState and free it together with pt device's obj. Signed-off-by: Lan Tianyu --- Cc Xen devel maillist hw/xen/xen_pt.c |4 ++-- hw/xen/xen_pt.h |2 +- hw/xen/xen_pt_msi.c |6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hw/xen/xen_pt.c b/hw/xen/xen_pt.c index 2b54f52..0c11069 100644 --- a/hw/xen/xen_pt.c +++ b/hw/xen/xen_pt.c @@ -587,11 +587,11 @@ static void xen_pt_region_update(XenPCIPassthroughState *s, }; bar = xen_pt_bar_from_region(s, mr); -if (bar == -1 && (!s->msix || &s->msix->mmio != mr)) { +if (bar == -1 && (!s->msix || &s->msix_mmio != mr)) { return; } -if (s->msix && &s->msix->mmio == mr) { +if (s->msix && &s->msix_mmio == mr) { if (adding) { s->msix->mmio_base_addr = sec->offset_within_address_space; rc = xen_pt_msix_update_remap(s, s->msix->bar_index); diff --git a/hw/xen/xen_pt.h b/hw/xen/xen_pt.h index 3bc22eb..3569c2c 100644 --- a/hw/xen/xen_pt.h +++ b/hw/xen/xen_pt.h @@ -199,7 +199,6 @@ typedef struct XenPTMSIX { uint64_t table_base; uint32_t table_offset_adjust; /* page align mmap */ uint64_t mmio_base_addr; -MemoryRegion mmio; void *phys_iomem_base; XenPTMSIXEntry msix_entry[0]; } XenPTMSIX; @@ -222,6 +221,7 @@ struct XenPCIPassthroughState { MemoryRegion bar[PCI_NUM_REGIONS - 1]; MemoryRegion rom; +MemoryRegion msix_mmio; MemoryListener memory_listener; MemoryListener io_listener; diff --git a/hw/xen/xen_pt_msi.c b/hw/xen/xen_pt_msi.c index e3d7194..ae39ab3 100644 --- a/hw/xen/xen_pt_msi.c +++ b/hw/xen/xen_pt_msi.c @@ -558,7 
+558,7 @@ int xen_pt_msix_init(XenPCIPassthroughState *s, uint32_t base) msix->msix_entry[i].pirq = XEN_PT_UNASSIGNED_PIRQ; } -memory_region_init_io(&msix->mmio, OBJECT(s), &pci_msix_ops, +memory_region_init_io(&s->msix_mmio, OBJECT(s), &pci_msix_ops, s, "xen-pci-pt-msix", (total_entries * PCI_MSIX_ENTRY_SIZE + XC_PAGE_SIZE - 1) @@ -599,7 +599,7 @@ int xen_pt_msix_init(XenPCIPassthroughState *s, uint32_t base) msix->phys_iomem_base); memory_region_add_subregion_overlap(&s->bar[bar_index], table_off, -&msix->mmio, +&s->msix_mmio, 2); /* Priority: pci default + 1 */ return 0; @@ -626,7 +626,7 @@ void xen_pt_msix_delete(XenPCIPassthroughState *s) + msix->table_offset_adjust); } -memory_region_del_subregion(&s->bar[msix->bar_index], &msix->mmio); +memory_region_del_subregion(&s->bar[msix->bar_index], &s->msix_mmio); g_free(s->msix); s->msix = NULL; -- 1.7.9.5 ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
[Xen-devel] [PATCH] Qemu/Xen: Fix early freeing MSIX MMIO memory region
From: > msix->mmio is added to XenPCIPassthroughState's object as property. object_finalize_child_property is called for XenPCIPassthroughState's object, which calls object_property_del_all, which is going to try to delete msix->mmio. object_finalize_child_property() will access msix->mmio's obj. But the whole msix struct has already been freed by xen_pt_msix_delete. This will cause segment fault when msix->mmio has been overwritten. This patch is to fix the issue. Signed-off-by: Lan Tianyu --- hw/xen/xen_pt.c |8 hw/xen/xen_pt.h |1 + hw/xen/xen_pt_config_init.c |2 +- hw/xen/xen_pt_msi.c | 13 - 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/hw/xen/xen_pt.c b/hw/xen/xen_pt.c index 2b54f52..aa96288 100644 --- a/hw/xen/xen_pt.c +++ b/hw/xen/xen_pt.c @@ -938,10 +938,18 @@ static void xen_pci_passthrough_class_init(ObjectClass *klass, void *data) dc->props = xen_pci_passthrough_properties; }; +static void xen_pci_passthrough_finalize(Object *obj) +{ +XenPCIPassthroughState *s = XEN_PT_DEVICE(obj); + +xen_pt_msix_delete(s); +} + static const TypeInfo xen_pci_passthrough_info = { .name = TYPE_XEN_PT_DEVICE, .parent = TYPE_PCI_DEVICE, .instance_size = sizeof(XenPCIPassthroughState), +.instance_finalize = xen_pci_passthrough_finalize, .class_init = xen_pci_passthrough_class_init, }; diff --git a/hw/xen/xen_pt.h b/hw/xen/xen_pt.h index 3bc22eb..c545280 100644 --- a/hw/xen/xen_pt.h +++ b/hw/xen/xen_pt.h @@ -305,6 +305,7 @@ void xen_pt_msi_disable(XenPCIPassthroughState *s); int xen_pt_msix_init(XenPCIPassthroughState *s, uint32_t base); void xen_pt_msix_delete(XenPCIPassthroughState *s); +void xen_pt_msix_unmap(XenPCIPassthroughState *s); int xen_pt_msix_update(XenPCIPassthroughState *s); int xen_pt_msix_update_remap(XenPCIPassthroughState *s, int bar_index); void xen_pt_msix_disable(XenPCIPassthroughState *s); diff --git a/hw/xen/xen_pt_config_init.c b/hw/xen/xen_pt_config_init.c index 4a5bc11..0efee11 100644 --- a/hw/xen/xen_pt_config_init.c +++ 
b/hw/xen/xen_pt_config_init.c @@ -2079,7 +2079,7 @@ void xen_pt_config_delete(XenPCIPassthroughState *s) /* free MSI/MSI-X info table */ if (s->msix) { -xen_pt_msix_delete(s); +xen_pt_msix_unmap(s); } g_free(s->msi); diff --git a/hw/xen/xen_pt_msi.c b/hw/xen/xen_pt_msi.c index e3d7194..82de2bc 100644 --- a/hw/xen/xen_pt_msi.c +++ b/hw/xen/xen_pt_msi.c @@ -610,7 +610,7 @@ error_out: return rc; } -void xen_pt_msix_delete(XenPCIPassthroughState *s) +void xen_pt_msix_unmap(XenPCIPassthroughState *s) { XenPTMSIX *msix = s->msix; @@ -627,6 +627,17 @@ void xen_pt_msix_delete(XenPCIPassthroughState *s) } memory_region_del_subregion(&s->bar[msix->bar_index], &msix->mmio); +} + +void xen_pt_msix_delete(XenPCIPassthroughState *s) +{ +XenPTMSIX *msix = s->msix; + +if (!msix) { +return; +} + +object_unparent(OBJECT(&msix->mmio)); g_free(s->msix); s->msix = NULL; -- 1.7.9.5 ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
[Xen-devel] Xen Q35 & virtual VTD support
Hi All: We are researching how to add virtual VTD support for Xen HVM guest. Current qemu has a basic virtual VTD support for Q35. I'd like to confirm whether Xen supports Q35 or not. Can we reuse it for Xen? Thanks. The motivations for adding virtual VTD support to Xen are to prepare for 1) Shared Virtual Memory (SVM) 2) Increase max VCPUs > 255 (The feature relies on virtual VTD irq remapping function.) ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Re: [Xen-devel] Xen Q35 & virtual VTD support
On 5/10/2016 6:11 PM, Stefano Stabellini wrote: Hello Tianyu, I am CC'ing Anthony who should have a better idea about this. Also please use my kernel.org email address for future correspondence. OK. I get it. What do you mean by reusing Q35 for Xen? If you mean using QEMU to emulate a Q35 based machine for HVM guests, I think that should be OK. From xl code, I find it passes pc, xenpv or xenfv as "-machine" param to Qemu except Q35. I also tried changing code to select Q35 in the xl but guest didn't boot up. So I want to check whether Xen supports Q35 in the current code or not. If yes, we can reuse Qemu virtual VTD for Xen with minor changes. Thanks, Stefano On Mon, 9 May 2016, Lan, Tianyu wrote: Hi All: We are researching how to add virtual VTD support for Xen HVM guest. Current qemu has a basic virtual VTD support for Q35. I'd like to confirm whether Xen supports Q35 or not. Can we reuse it for Xen? Thanks. The motivations of adding virtual VTD support for Xen prepare for 1) Shared Virtual Memory (SVM) 2) Increase max VCPUs > 255 (The feature relies on virtual VTD irq remapping function.) ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Re: [Xen-devel] Xen Q35 & virtual VTD support
On 5/10/2016 10:52 PM, Anthony PERARD wrote: On Tue, May 10, 2016 at 10:31:38PM +0800, Lan, Tianyu wrote: On 5/10/2016 6:11 PM, Stefano Stabellini wrote: Hello Tianyu, What do you mean by reusing Q35 for Xen? If you mean using QEMU to emulate a Q35 based machine for HVM guests, I think that should be OK. From xl code, I findit passes pc, xenpv or xenfv as "-machine" param to Qemu except Q35. I also tried changing code to select Q35 in the xl but guest didn't boot up. So I want to check whether Xen supports Q35 in the current code or not. No, Xen do not support Q35. But that is possible in the future, I did some work on it in the past, you can find it here if you are curious: http://xenbits.xen.org/gitweb/?p=people/aperard/xen-unstable.git;a=shortlog;h=refs/heads/machine-q35-wip I'm not sure if it's going to help you. Great. Thanks a lot for your help. BTW, do you have plan to upstream these patches? ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
[Xen-devel] [PATCH] Xen/timer: Disable watchdog during dumping timer queues
On a machine with a large number of cpus, dump_timerq() lasts several seconds which may exceed watchdog timeout and cause Xen hypervisor reboot. This patch is to disable watchdog when dumping timer queues to fix the issue. Signed-off-by: Lan Tianyu --- xen/common/timer.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/xen/common/timer.c b/xen/common/timer.c index 29a60a9..2d9d828 100644 --- a/xen/common/timer.c +++ b/xen/common/timer.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -524,6 +525,7 @@ static void dump_timerq(unsigned char key) s_time_t now = NOW(); inti, j; +watchdog_disable(); printk("Dumping timer queues:\n"); for_each_online_cpu( i ) @@ -538,6 +540,8 @@ static void dump_timerq(unsigned char key) dump_timer(t, now); spin_unlock_irqrestore(&ts->lock, flags); } + +watchdog_enable(); } static void migrate_timers_from_cpu(unsigned int old_cpu) -- 1.7.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH] Xen/timer: Disable watchdog during dumping timer queues
On 9/13/2016 11:25 PM, Jan Beulich wrote: Wait - what is do_invalid_op() doing on the stack? I don't think it belongs there, and hence I wonder whether the keypress happened after some already fatal event (in which case all bets are off anyway). Not clear why do_invalid_op() on the stack. There is no other fatal event. The issue disappears when set watchdog_timeout to 10s. > Another solution is to schedule a tasklet to run keyhandler in timer > handler and invoke process_pending_softirqs() in the dump_timerq(). > This also works but it requires to rework keyhandler mechanism. > > Disable watchdog seems to be simpler and I found dump_registers() also > used the same way to deal with the issue. That's true. Just that on large machines it defaults to the alternative model, for which I'm not sure it actually needs the watchdog disabled (as data for a single CPU shouldn't exceed the threshold). It seems not to be necessary to disable watchdog in alternative model since dumping a single cpu's status will not last a long time. For the issue in the dump timer info handler, disabling watchdog is ok for you or you have other suggestions to resolve the issue? I also found other places where dump a lot of logs disable watchdog. (E,G run_all_keyhandlers(), debugtrace_dump() debugtrace_toggle() and so on). This seems a common solution. Jan > Here is my draft patch of reworking keyhandler. ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Xen virtual IOMMU high level design doc
Hi Andrew: Sorry to bother you. To make sure we are on the right direction, it's better to get feedback from you before we go further step. Could you have a look? Thanks. On 8/17/2016 8:05 PM, Lan, Tianyu wrote: Hi All: The following is our Xen vIOMMU high level design for detail discussion. Please have a look. Very appreciate for your comments. This design doesn't cover changes when root port is moved to hypervisor. We may design it later. Content: === 1. Motivation of vIOMMU 1.1 Enable more than 255 vcpus 1.2 Support VFIO-based user space driver 1.3 Support guest Shared Virtual Memory (SVM) 2. Xen vIOMMU Architecture 2.1 2th level translation overview 2.2 Interrupt remapping overview 3. Xen hypervisor 3.1 New vIOMMU hypercall interface 3.2 2nd level translation 3.3 Interrupt remapping 3.4 1st level translation 3.5 Implementation consideration 4. Qemu 4.1 Qemu vIOMMU framework 4.2 Dummy xen-vIOMMU driver 4.3 Q35 vs. i440x 4.4 Report vIOMMU to hvmloader 1 Motivation for Xen vIOMMU === 1.1 Enable more than 255 vcpu support HPC virtualization requires more than 255 vcpus support in a single VM to meet parallel computing requirement. More than 255 vcpus support requires interrupt remapping capability present on vIOMMU to deliver interrupt to #vcpu >255 Otherwise Linux guest fails to boot up with >255 vcpus if interrupt remapping is absent. 1.2 Support VFIO-based user space driver (e.g. DPDK) in the guest It relies on the 2nd level translation capability (IOVA->GPA) on vIOMMU. pIOMMU 2nd level becomes a shadowing structure of vIOMMU to isolate DMA requests initiated by user space driver. 1.3 Support guest SVM (Shared Virtual Memory) It relies on the 1st level translation table capability (GVA->GPA) on vIOMMU. pIOMMU needs to enable both 1st level and 2nd level translation in nested mode (GVA->GPA->HPA) for passthrough device. IGD passthrough is the main usage today (to support OpenCL 2.0 SVM feature). In the future SVM might be used by other I/O devices too. 2. 
Xen vIOMMU Architecture * vIOMMU will be inside Xen hypervisor for following factors 1) Avoid round trips between Qemu and Xen hypervisor 2) Ease of integration with the rest of the hypervisor 3) HVMlite/PVH doesn't use Qemu * Dummy xen-vIOMMU in Qemu as a wrapper of new hypercall to create /destory vIOMMU in hypervisor and deal with virtual PCI device's 2th level translation. 2.1 2th level translation overview For Virtual PCI device, dummy xen-vIOMMU does translation in the Qemu via new hypercall. For physical PCI device, vIOMMU in hypervisor shadows IO page table from IOVA->GPA to IOVA->HPA and load page table to physical IOMMU. The following diagram shows 2th level translation architecture. +-+ |Qemu++ | || Virtual| | || PCI device | | ||| | |++ | ||DMA | |V| | ++ Request ++ | | |+<---+| | | | Dummy xen vIOMMU | Target GPA | Memory region | | | |+--->+| | | +-+--++---++ | || || ||Hypercall || +++ |Hypervisor | || || || |v || | +--+--+|| | | vIOMMU||| | +--+--+|| || || |v || | +--+--+|| | | IOMMU driver||| | +--+--+|| || || +++ |HW v V| | +--+--+ +-+
Re: [Xen-devel] [PATCH] Xen/timer: Disable watchdog during dumping timer queues
On 9/15/2016 10:32 PM, Jan Beulich wrote: On 15.09.16 at 16:16, wrote: On 9/13/2016 11:25 PM, Jan Beulich wrote: Wait - what is do_invalid_op() doing on the stack? I don't think it belongs there, and hence I wonder whether the keypress happened after some already fatal event (in which case all bets are off anyway). Not clear why do_invalid_op() on the stack. There is no other fatal event. The issue disappears when set watchdog_timeout to 10s. Another solution is to schedule a tasklet to run keyhandler in timer handler and invoke process_pending_softirqs() in the dump_timerq(). This also works but it requires to rework keyhandler mechanism. Disable watchdog seems to be simpler and I found dump_registers() also used the same way to deal with the issue. That's true. Just that on large machines it defaults to the alternative model, for which I'm not sure it actually needs the watchdog disabled (as data for a single CPU shouldn't exceed the threshold). It seems not to be necessary to disable watchdog in alternative model since dumping a single cpu's status will not last a long time. For the issue in the dump timer info handler, disabling watchdog is ok for you or you have other suggestions to resolve the issue? Well, without a clear understanding of why the issue occurs (for which I need to refer you back to the questionable stack dump) I'm hesitant to agree to this step, yet ... After some researches, I found do_invalid_op() on the stack dump is caused by run_in_exception_handler(__ns16550_poll) in the ns16550_poll() rather than fatal event. The timeout issue still exists when run __ns16550_poll() directly in the ns16550_poll(). I also found other places where dump a lot of logs disable watchdog. (E,G run_all_keyhandlers(), debugtrace_dump() debugtrace_toggle() and so on). This seems a common solution. ... I'm also not entirely against it considering the various other examples. I.e. 
as almost always: As long as the need for the change can be properly explained, I won't stand in the way of getting it in. Jan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH] Xen/timer: Disable watchdog during dumping timer queues
On 9/19/2016 10:46 PM, Jan Beulich wrote: Well, without a clear understanding of why the issue occurs (for >> which I need to refer you back to the questionable stack dump) >> I'm hesitant to agree to this step, yet ... > > After some researches, I found do_invalid_op() on the stack dump is > caused by run_in_exception_handler(__ns16550_poll) in the ns16550_poll() > rather than fatal event. The timeout issue still exists when run > __ns16550_poll() directly in the ns16550_poll(). Well, I then still don't see why e.g. dump_domains() doesn't also need it. After testing, dump_domains() also has such issue after I create two VM with 128 vcpus. Earlier you did say: Keyhandler may run in the timer handler and the following log shows calltrace. The timer subsystem run all expired timers' handler before programing next timer event. If keyhandler runs longer than timeout, there will be no chance to configure timer before triggering watchdog and hypervisor rebooting. The fact that using debug keys may adversely affect the rest of the system is known. And the nesting of process_pending_softirqs() inside do_softirq() should, from looking at them, work fine. So I continue to have trouble seeing the specific reason for the problem you say you observe. The precondition of process_pending_softirq() working in the debug key handler is that timer interrupt arrives on time and nmi_timer_fn() can run to update nmi_timer_ticks before watchdog timeout. When a timer interrupt arrives, timer_softirq_action() will run all expired timer handlers before programing next timer interrupt via reprogram_timer(). If a timer handler runs too long E,G >5s(Time for watchdog timeout is default to be 5s.), this will cause no timer interrupt arriving within 5s and nmi_timer_fn() also won't be called. Does this make sense to you? And as a separate note - dump_registers() is quite an exception among the key handlers, and that's for a good reason (as the comment there says). 
So I continue to be hesitant to see this spread to other key handlers. Jan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH] Xen/timer: Disable watchdog during dumping timer queues
On 2016年09月20日 23:36, Jan Beulich wrote: >> The precondition of process_pending_softirq() working in the debug key >> > handler is that timer interrupt arrives on time and nmi_timer_fn() can >> > run to update nmi_timer_ticks before watchdog timeout. > Precondition? Process_pending_softirq() in debug key handler is mainly to deal with timer softirq to update nmi_timer_ticks in order to avoid NMI watchdog. If there is no timer interrupt arriving for long time, process_pending_softirq() here is meaningless and NMI watchdog still will be timeout. > >> > When a timer interrupt arrives, timer_softirq_action() will run all >> > expired timer handlers before programing next timer interrupt via >> > reprogram_timer(). If a timer handler runs too long E,G >5s(Time for >> > watchdog timeout is default to be 5s.), this will cause no timer >> > interrupt arriving within 5s and nmi_timer_fn() also won't be called. >> > Does this make sense to you? > Partly. I continue to think that the sequence > > some keyhandler > timer interrupt > keyhandler continues > keyhandler calls process_pending_softirq() > Question for your sequence is why there is timer interrupt before programing timer interrupt. Actually the sequence in this case is timer interrupt run key handlers in timer handler program next timer interrupt ... > should, among other things, result in timer_softirq_action() to get > run. And I don't see the _timer_ handler running for to long here, > only a key handler. Key handler may run a long time(E,G >5s) on machine with amount of cpus or create huge VM. If keyhandler doesn't run for long time, timer_softirq_action() would also be not necessary since the default timeout is 5s and nmi timer's interval is 1s. > Are you perhaps instead suffering from the > nested instance of timer_softirq_action() not being able to acquire > its lock? No, the serial port continues printing timer info before watchdog timeout. 
-- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH] Xen/timer: Disable watchdog during dumping timer queues
On 9/21/2016 5:25 PM, Jan Beulich wrote: On 21.09.16 at 03:54, wrote: On 2016年09月20日 23:36, Jan Beulich wrote: The precondition of process_pending_softirq() working in the debug key handler is that timer interrupt arrives on time and nmi_timer_fn() can run to update nmi_timer_ticks before watchdog timeout. Precondition? Process_pending_softirq() in debug key handler is mainly to deal with timer softirq to update nmi_timer_ticks in order to avoid NMI watchdog. If there is no timer interrupt arriving for long time, process_pending_softirq() here is meaningless and NMI watchdog still will be timeout. Oh, right. Still I continue to be unconvinced that disabling the watchdog is the right answer (not running timers for a long time has other undesirable consequence), or if it is, then it being needed in only this one key handler. So perhaps you should really consider submitting your generic key handler adjustment as an alternative. Disable watchdog is common solution for such kind of issues in current codes and so I chose it. I also proposed another solution in previous mail that run keyhandler always in a tasklet and insert process_pending_softirq() in the keyhandler. But please also answer the earlier question, which you did strip from your reply: Which btw raises another question: Why are you in polling mode in the first place? Do you have a UART without working interrupt? I found there was no interrupt with Xen ns16550 dirver while linux kernel's serial driver can receive interrupt. Jan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH] Xen/timer: Disable watchdog during dumping timer queues
On 9/22/2016 10:26 PM, Jan Beulich wrote: But please also answer the earlier question, which you did strip >> from your reply: >> >>> Which btw raises another question: Why are you in polling mode in >>> the first place? Do you have a UART without working interrupt? > > I found there was no interrupt with Xen ns16550 dirver while > linux kernel's serial driver can receive interrupt. And do you know the reason? Is it perhaps a PCI plug in card, and you don't specify the IRQ on the command line? Or the kernel doesn't provide the necessary information (from ACPI) for Xen to set up that IRQ? No, I am not familiar serial device. But it's a ACPI device from linux sysfs node and serial drivers use irq 4 for their interrupt both on linux and Xen. ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [Resend PATCH 1/2] Xen/Keyhandler: Make keyhandler always run in tasklet
Keyhandler may run for a long time in a timer handler on the large machine with a lot of physical cpus(E,G keyhandler for dumping timer info) when serial port driver works in the poll mode. When timer interrupt arrives, timer subsystem runs all timer handlers before programming next timer interrupt. So if timer handler runs longer than time for watchdog timeout, the timer handler of watchdog will be blocked to feed watchdog and xen hypervisor panics. This patch is to fix the issue via always scheduling a tasklet to run keyhandler to avoid timer handler running too long. Signed-off-by: Lan Tianyu --- xen/common/keyhandler.c |8 +--- 1 files changed, 5 insertions(+), 3 deletions(-) diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c index 16de6e8..fce52d2 100644 --- a/xen/common/keyhandler.c +++ b/xen/common/keyhandler.c @@ -75,7 +75,9 @@ static struct keyhandler { static void keypress_action(unsigned long unused) { -handle_keypress(keypress_key, NULL); +console_start_log_everything(); +key_table[keypress_key].fn(keypress_key); +console_end_log_everything(); } static DECLARE_TASKLET(keypress_tasklet, keypress_action, 0); @@ -87,10 +89,10 @@ void handle_keypress(unsigned char key, struct cpu_user_regs *regs) if ( key >= ARRAY_SIZE(key_table) || !(h = &key_table[key])->fn ) return; -if ( !in_irq() || h->irq_callback ) +if ( h->irq_callback ) { console_start_log_everything(); -h->irq_callback ? h->irq_fn(key, regs) : h->fn(key); +h->irq_fn(key, regs); console_end_log_everything(); } else -- 1.7.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [Resend PATCH 0/2] Xen: Fix Xen hypervisor panic during dumping timer info on huge machine.
Resend because the patchset seems to miss xen devel maillist. This patchset is to fix triggering NMI watchdog during dumping timer info on the huge machine with a large number of physical cpus. For details, please see the change log of Patch 1. Previous discussion: https://patchwork.kernel.org/patch/9328449/ Lan Tianyu (2): Xen/Keyhandler: Make keyhandler always run in tasklet Xen/timer: Process softirq during dumping timer info xen/common/keyhandler.c |8 +--- xen/common/timer.c |1 + 2 files changed, 6 insertions(+), 3 deletions(-) LocalWords: 8f82fa7cd8f2407b92d6994a65084951cf28a247 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [Resend PATCH 2/2] Xen/timer: Process softirq during dumping timer info
Dumping timer info may run for a long time on the huge machine with a lot of physical cpus. To avoid triggering NMI watchdog, add process_pending_softirqs() in the loop of dumping timer info. Signed-off-by: Lan Tianyu --- xen/common/timer.c |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/xen/common/timer.c b/xen/common/timer.c index 29a60a9..ab6bca0 100644 --- a/xen/common/timer.c +++ b/xen/common/timer.c @@ -530,6 +530,7 @@ static void dump_timerq(unsigned char key) { ts = &per_cpu(timers, i); +process_pending_softirqs(); printk("CPU%02d:\n", i); spin_lock_irqsave(&ts->lock, flags); for ( j = 1; j <= GET_HEAP_SIZE(ts->heap); j++ ) -- 1.7.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [Resend PATCH 1/2] Xen/Keyhandler: Make keyhandler always run in tasklet
Hi Konrad: Thanks for your review. On 2016年10月01日 02:07, Konrad Rzeszutek Wilk wrote: > On Fri, Sep 30, 2016 at 10:19:05AM +0800, Lan Tianyu wrote: >> Keyhandler may run for a long time in a timer handler on the large machine > > I am bit lost. > > You say 'timer handler' which will imply that there is some form > of 'init_timer' and 'set_timer' that would call the handle_keypress > function? > But I am not seeing it? > > Or are you saying that when 'dump_timerq' is invoked? > If so please say that. When serial port driver works in the poll mode, it will set a regular timer to deal with all input key and keyhandler(e,g dump_timerq()) will run in the timer handler. > >> with a lot of physical cpus(E,G keyhandler for dumping timer info) when >> serial > > s/E,G/e.g.g/ > >> port driver works in the poll mode. When timer interrupt arrives, timer >> subsystem > > s/poll mode/poll mode (via the exception mechanism)/ > >> runs all timer handlers before programming next timer interrupt. So if timer >> handler >> runs longer than time for watchdog timeout, the timer handler of watchdog >> will be > > Ah, so this is if a guest has set a timer and we are executing it. Or we have > many of them to go through. I meant the serial port timer handler here which calls keyhandler will run long time, no APIC timer interrupt will arrive to trigger timer softirq and feed watchdog during this procedure. Because there is no chance to program timer interrupt before completing all timer handlers in this case. > >> blocked to feed watchdog and xen hypervisor panics. This patch is to fix the >> issue >> via always scheduling a tasklet to run keyhandler to avoid timer handler >> running >> too long. > > You say "timer handler" again. But the timer handlers are executed via > timer_softirq_action (which is a softirq, aka triggered by IPI). In this case, APIC timer interrupt handler apic_timer_interrupt() triggers timer softirq and runs all expired timer handlers in timer softirq. 
> > And the tasklet will mean that that it gets to be executed _after_ the > do_softirq is done (as softirq.h puts the low numbered ones first, such > as the TIMER_SOFTIRQ)? > > So what I think you are saying is that you do not want the > 'timer_softirq_action' > to be preempted by the 'dump_timerq' (or any other ones) which will > trip the watchdog timeout. I want to make sure serial port timer handler doesn't run long time and not affect feed dog operation. > > If that is the case please put something to that affect in the > commit description. > > That begs one question that should be probably answered in the commit > description: > > Why can't the dump_timerq or any other keyhandler poke the watchdog > (expose nmi_timer_fn and call that?) Do you mean to feed nmi watchdog in the keyhandler directly? > >> >> Signed-off-by: Lan Tianyu > > Otherwise the mechanical parts of the patch look good. > >> --- >> xen/common/keyhandler.c |8 +--- >> 1 files changed, 5 insertions(+), 3 deletions(-) >> >> diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c >> index 16de6e8..fce52d2 100644 >> --- a/xen/common/keyhandler.c >> +++ b/xen/common/keyhandler.c >> @@ -75,7 +75,9 @@ static struct keyhandler { >> >> static void keypress_action(unsigned long unused) >> { >> -handle_keypress(keypress_key, NULL); >> +console_start_log_everything(); >> +key_table[keypress_key].fn(keypress_key); >> +console_end_log_everything(); >> } >> >> static DECLARE_TASKLET(keypress_tasklet, keypress_action, 0); >> @@ -87,10 +89,10 @@ void handle_keypress(unsigned char key, struct >> cpu_user_regs *regs) >> if ( key >= ARRAY_SIZE(key_table) || !(h = &key_table[key])->fn ) >> return; >> >> -if ( !in_irq() || h->irq_callback ) >> +if ( h->irq_callback ) >> { >> console_start_log_everything(); >> -h->irq_callback ? 
h->irq_fn(key, regs) : h->fn(key); >> +h->irq_fn(key, regs); >> console_end_log_everything(); >> } >> else >> -- >> 1.7.1 >> -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [Resend PATCH 1/2] Xen/Keyhandler: Make keyhandler always run in tasklet
On 2016年10月06日 20:52, Jan Beulich wrote: On 30.09.16 at 04:19, wrote: >> @@ -87,10 +89,10 @@ void handle_keypress(unsigned char key, struct >> cpu_user_regs *regs) >> if ( key >= ARRAY_SIZE(key_table) || !(h = &key_table[key])->fn ) >> return; >> >> -if ( !in_irq() || h->irq_callback ) >> +if ( h->irq_callback ) > > Please make subject/description reflect this: You don't _always_ > force the use of the tasklet. Ok. I also find register_irq_keyhandler() isn't called anywhere in current code and that means none uses irq_callback. Can we remove it? > > And then I don't think we want the debugkey sysctl get processed > asynchronously - the sysctl should complete only when the key has > been fully handled, in order to not interfere with a subsequent one > (namely the one retrieving the log buffer). We may introduce a new parameter for handle_keypress() to specify whether it should schedule a tasklet to run keyhandler or not. For sysctl case, it should be the later one. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [Resend PATCH 2/2] Xen/timer: Process softirq during dumping timer info
On 2016年10月06日 20:56, Jan Beulich wrote: On 30.09.16 at 04:19, wrote: >> --- a/xen/common/timer.c >> +++ b/xen/common/timer.c >> @@ -530,6 +530,7 @@ static void dump_timerq(unsigned char key) >> { >> ts = &per_cpu(timers, i); >> >> +process_pending_softirqs(); >> printk("CPU%02d:\n", i); >> spin_lock_irqsave(&ts->lock, flags); >> for ( j = 1; j <= GET_HEAP_SIZE(ts->heap); j++ ) > > Hmm - is that enough when there are many timers on one CPU? But > well, adding something inside the lock region would of course make > things quite a bit harder, so I guess this has to be enough for now. > Yes, it's hard to add process_pending_softirqs() under the lock, just like you said. I searched for init_timer() and there are 28 callers. Printing 28 lines of timer info should only take a brief amount of time. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [Resend PATCH 1/2] Xen/Keyhandler: Make keyhandler always run in tasklet
On 2016年10月10日 21:55, Konrad Rzeszutek Wilk wrote: > On Sat, Oct 08, 2016 at 11:26:44AM +0800, Lan Tianyu wrote: >> On 2016年10月06日 20:52, Jan Beulich wrote: >>>>>> On 30.09.16 at 04:19, wrote: >>>> @@ -87,10 +89,10 @@ void handle_keypress(unsigned char key, struct >>>> cpu_user_regs *regs) >>>> if ( key >= ARRAY_SIZE(key_table) || !(h = &key_table[key])->fn ) >>>> return; >>>> >>>> -if ( !in_irq() || h->irq_callback ) >>>> +if ( h->irq_callback ) >>> >>> Please make subject/description reflect this: You don't _always_ >>> force the use of the tasklet. >> >> Ok. I also find register_irq_keyhandler() isn't called anywhere in >> current code and that means none uses irq_callback. Can we remove it? > > But it is. See IRQ_KEYHANDLER Oh. Yes. Thanks for your information. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Xen virtual IOMMU high level design doc
On 2016年10月06日 02:36, Konrad Rzeszutek Wilk wrote: >>> 3.3 Interrupt remapping >>> > > Interrupts from virtual devices and physical devices will be delivered >>> > > to vlapic from vIOAPIC and vMSI. It needs to add interrupt remapping >>> > > hooks in the vmsi_deliver() and ioapic_deliver() to find target vlapic >>> > > according interrupt remapping table. The following diagram shows the >>> > > logic. >>> > > > Uh? Missing diagram? Sorry. This is stale statement. The diagram was moved to 2.2 Interrupt remapping overview. > >>> 4.3 Q35 vs i440x >>> > > VT-D is introduced since Q35 chipset. Previous concern was that IOMMU > s/since/with/ >>> > > driver has assumption that VTD only exists on Q35 and newer chipset and >>> > > we have to enable Q35 first. >>> > > >>> > > Consulted with Linux/Windows IOMMU driver experts and get that these >>> > > drivers doesn't have such assumption. So we may skip Q35 implementation >>> > > and can emulate vIOMMU on I440x chipset. KVM already have vIOMMU support >>> > > with virtual PCI device's DMA translation and interrupt remapping. We >>> > > are using KVM to do experiment of adding vIOMMU on the I440x and test >>> > > Linux/Windows guest. Will report back when have some results. > Any results? We have booted up Win8 guest with virtual VTD and emulated I440x platform on Xen and guest uses virtual VTD to enable interrupt remapping function. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH v2 0/2] Xen: Fix Xen hypervisor panic during dumping timer info on huge machine.
This patchset fixes triggering of the NMI watchdog while dumping timer info on a huge machine with a large number of physical cpus. For details, please see the change log of Patch 1. Previous discussion: https://patchwork.kernel.org/patch/9328449/ Changes since V1: Add an "async" param for handle_keypress() to identify whether to run the nonirq keyhandler in a tasklet or not. This is to avoid processing the debugkey sysctl asynchronously. Lan Tianyu (2): Xen/Keyhandler: Rework process of nonirq keyhandler Xen/timer: Process softirq during dumping timer info xen/common/keyhandler.c |8 +--- xen/common/sysctl.c |2 +- xen/common/timer.c |1 + xen/drivers/char/console.c |2 +- xen/include/xen/keyhandler.h |4 +++- 5 files changed, 11 insertions(+), 6 deletions(-) ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH v2 1/2] Xen/Keyhandler: Rework process of nonirq keyhandler
Keyhandler may run for a long time in serial port driver's timer handler on the large machine with a lot of physical cpus(e,g dump_timerq()) when serial port driver works in the poll mode(via the exception mechanism). If a timer handler runs a long time, it will block nmi_timer_fn() to feed NMI watchdog and cause Xen hypervisor panic. Inserting process_pending_softirqs() in timer handler will not help. when timer interrupt arrives, timer subsystem calls all expired timer handlers before programming next timer interrupt. There is no timer interrupt arriving to trigger timer softirq during run a timer handler. This patch is to fix the issue to make nonirq keyhandler run in tasklet when receive debug key from serial port. Signed-off-by: Lan Tianyu --- xen/common/keyhandler.c |8 +--- xen/common/sysctl.c |2 +- xen/drivers/char/console.c |2 +- xen/include/xen/keyhandler.h |4 +++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c index 16de6e8..3d50041 100644 --- a/xen/common/keyhandler.c +++ b/xen/common/keyhandler.c @@ -75,19 +75,21 @@ static struct keyhandler { static void keypress_action(unsigned long unused) { -handle_keypress(keypress_key, NULL); +console_start_log_everything(); +key_table[keypress_key].fn(keypress_key); +console_end_log_everything(); } static DECLARE_TASKLET(keypress_tasklet, keypress_action, 0); -void handle_keypress(unsigned char key, struct cpu_user_regs *regs) +void handle_keypress(unsigned char key, struct cpu_user_regs *regs, bool async) { struct keyhandler *h; if ( key >= ARRAY_SIZE(key_table) || !(h = &key_table[key])->fn ) return; -if ( !in_irq() || h->irq_callback ) +if ( h->irq_callback || !async ) { console_start_log_everything(); h->irq_callback ? 
h->irq_fn(key, regs) : h->fn(key); diff --git a/xen/common/sysctl.c b/xen/common/sysctl.c index 8aea6ef..1eb7bad 100644 --- a/xen/common/sysctl.c +++ b/xen/common/sysctl.c @@ -136,7 +136,7 @@ long do_sysctl(XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl) { if ( copy_from_guest_offset(&c, op->u.debug_keys.keys, i, 1) ) goto out; -handle_keypress(c, guest_cpu_user_regs()); +handle_keypress(c, guest_cpu_user_regs(), false); } ret = 0; copyback = 0; diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c index 55ae31a..184b523 100644 --- a/xen/drivers/char/console.c +++ b/xen/drivers/char/console.c @@ -347,7 +347,7 @@ static void switch_serial_input(void) static void __serial_rx(char c, struct cpu_user_regs *regs) { if ( xen_rx ) -return handle_keypress(c, regs); +return handle_keypress(c, regs, true); /* Deliver input to guest buffer, unless it is already full. */ if ( (serial_rx_prod-serial_rx_cons) != SERIAL_RX_SIZE ) diff --git a/xen/include/xen/keyhandler.h b/xen/include/xen/keyhandler.h index 06c05c8..e9595bd 100644 --- a/xen/include/xen/keyhandler.h +++ b/xen/include/xen/keyhandler.h @@ -46,7 +46,9 @@ void register_irq_keyhandler(unsigned char key, bool_t diagnostic); /* Inject a keypress into the key-handling subsystem. */ -extern void handle_keypress(unsigned char key, struct cpu_user_regs *regs); +extern void handle_keypress(unsigned char key, + struct cpu_user_regs *regs, + bool async); /* Scratch space is available for use of any keyhandler. */ extern char keyhandler_scratch[1024]; -- 1.7.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH v2 2/2] Xen/timer: Process softirq during dumping timer info
Dumping timer info may run for a long time on the huge machine with a lot of physical cpus. To avoid triggering NMI watchdog, add process_pending_softirqs() in the loop of dumping timer info. Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Lan Tianyu --- xen/common/timer.c |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/xen/common/timer.c b/xen/common/timer.c index 29a60a9..ab6bca0 100644 --- a/xen/common/timer.c +++ b/xen/common/timer.c @@ -530,6 +530,7 @@ static void dump_timerq(unsigned char key) { ts = &per_cpu(timers, i); +process_pending_softirqs(); printk("CPU%02d:\n", i); spin_lock_irqsave(&ts->lock, flags); for ( j = 1; j <= GET_HEAP_SIZE(ts->heap); j++ ) -- 1.7.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH v2 0/2] Xen: Fix Xen hypervisor panic during dumping timer info on huge machine.
On 2016年10月12日 16:09, Jan Beulich wrote: >>>> On 12.10.16 at 17:44, wrote: >> This patchset is to fix triggering NMI watchdog during dump timer info >> on the huge machine with a mount of physical cpus. Detail please see >> change log of Patch 1. >> >> Previous discussion: >> https://patchwork.kernel.org/patch/9328449/ >> >> Change since V1: >> Add "async" param for handle_keypress() to identify >> whether run nonirq keyhandler in tasklet or not. This is to >> avoid processing debugkey sysctl asynchronously. >> >> >> Lan Tianyu (2): >> Xen/Keyhandler: Rework process of nonirq keyhandler >> Xen/timer: Process softirq during dumping timer info > > This second patch went in already a few days ago. > Oh. Sorry for the noise. I didn't notice that. > Also, any reason you send to the list twice (once @lists.xen.org, > and another time to @lists.xenproject.org)? Sometimes I found that my patches weren't able to reach xen-devel, so I sent to both the xen.org and xenproject.org mailing lists. I will double check. > > Jan > -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] Discussion about virtual iommu support for Xen guest
Hi All: We try pushing virtual iommu support for Xen guest and there are some features blocked by it. Motivation: --- 1) Add SVM(Shared Virtual Memory) support for Xen guest To support iGFX pass-through for SVM enabled devices, it requires virtual iommu support to emulate related registers and intercept/handle guest SVM configure in the VMM. 2) Increase max vcpu support for one VM. So far, max vcpu for Xen hvm guest is 128. For HPC(High Performance Computing) cloud computing, it requires more vcpus support in a single VM. The usage model is to create just one VM on a machine with the same number vcpus as logical cpus on the host and pin vcpu on each logical cpu in order to get good compute performance. Intel Xeon phi KNL(Knights Landing) is dedicated to HPC market and supports 288 logical cpus. So we hope VM can support 288 vcpu to meet HPC requirement. Current Linux kernel requires IR(interrupt remapping) when MAX APIC ID is > 255 because interrupt only can be delivered among 0~255 cpus without IR. IR in VM relies on the virtual iommu support. KVM Virtual iommu support status Current, Qemu has a basic virtual iommu to do address translation for virtual device and it only works for the Q35 machine type. KVM reuses it and Redhat is adding IR to support more than 255 vcpus. How to add virtual iommu for Xen? - First idea came to my mind is to reuse Qemu virtual iommu but Xen didn't support Q35 so far. Enabling Q35 for Xen seems not a short term task. Anthony did some related jobs before. I'd like to see your comments about how to implement virtual iommu for Xen. 1) Reuse Qemu virtual iommu or write a separate one for Xen? 2) Enable Q35 for Xen to reuse Qemu virtual iommu? Your comments are very appreciated. Thanks a lot. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Re: [Xen-devel] Discussion about virtual iommu support for Xen guest
On 2016年05月26日 16:42, Dong, Eddie wrote: > If enabling virtual Q35 solves the problem, it has the advantage: When more > and more virtual IOMMU feature comes (likely), we can reuse the KVM code for > Xen. > How big is the effort for virtual Q35? I think the main effort is to rebuild all ACPI tables for Q35 and add Q35 support in the hvmloader. My concern is about the new ACPI tables' compatibility, especially with Windows guests. -- Best regards Tianyu Lan > > Thx Eddie > ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Re: [Xen-devel] Discussion about virtual iommu support for Xen guest
On 2016年05月26日 19:35, Andrew Cooper wrote: > On 26/05/16 09:29, Lan Tianyu wrote: > > To be viable going forwards, any solution must work with PVH/HVMLite as > much as HVM. This alone negates qemu as a viable option. > > From a design point of view, having Xen needing to delegate to qemu to > inject an interrupt into a guest seems backwards. > Sorry, I am not familiar with HVMlite. HVMlite doesn't use Qemu and the qemu virtual iommu can't work for it. We have to rewrite virtual iommu in the Xen, right? > > A whole lot of this would be easier to reason about if/when we get a > basic root port implementation in Xen, which is necessary for HVMLite, > and which will make the interaction with qemu rather more clean. It is > probably worth coordinating work in this area. The virtual iommu also should be under basic root port in Xen, right? > > As for the individual issue of 288vcpu support, there are already issues > with 64vcpu guests at the moment. While it is certainly fine to remove > the hard limit at 255 vcpus, there is a lot of other work required to > even get 128vcpu guests stable. Could you give some points to these issues? We are enabling more vcpus support and it can boot up 255 vcpus without IR support basically. It's very helpful to learn about known issues. We will also add more tests for 128 vcpus into our regular test to find related bugs. Increasing max vcpu to 255 should be a good start. > > ~Andrew > -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Re: [Xen-devel] Discussion about virtual iommu support for Xen guest
On 5/27/2016 4:19 PM, Lan Tianyu wrote: On 2016年05月26日 19:35, Andrew Cooper wrote: On 26/05/16 09:29, Lan Tianyu wrote: To be viable going forwards, any solution must work with PVH/HVMLite as much as HVM. This alone negates qemu as a viable option. From a design point of view, having Xen needing to delegate to qemu to inject an interrupt into a guest seems backwards. Sorry, I am not familiar with HVMlite. HVMlite doesn't use Qemu and the qemu virtual iommu can't work for it. We have to rewrite virtual iommu in the Xen, right? A whole lot of this would be easier to reason about if/when we get a basic root port implementation in Xen, which is necessary for HVMLite, and which will make the interaction with qemu rather more clean. It is probably worth coordinating work in this area. The virtual iommu also should be under basic root port in Xen, right? As for the individual issue of 288vcpu support, there are already issues with 64vcpu guests at the moment. While it is certainly fine to remove the hard limit at 255 vcpus, there is a lot of other work required to even get 128vcpu guests stable. Could you give some points to these issues? We are enabling more vcpus support and it can boot up 255 vcpus without IR support basically. It's very helpful to learn about known issues. We will also add more tests for 128 vcpus into our regular test to find related bugs. Increasing max vcpu to 255 should be a good start. Hi Andrew: Could you give more inputs about issues with 64 vcpus and what needs to be done to make 128vcpu guest stable? We hope to do somethings to improve them. What's progress of PCI host bridge in Xen? From your opinion, we should do that first, right? Thanks. ~Andrew ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Re: [Xen-devel] Discussion about virtual iommu support for Xen guest
On 6/3/2016 7:17 PM, Tian, Kevin wrote: From: Andrew Cooper [mailto:andrew.coop...@citrix.com] Sent: Friday, June 03, 2016 2:59 AM On 02/06/16 16:03, Lan, Tianyu wrote: On 5/27/2016 4:19 PM, Lan Tianyu wrote: On 2016年05月26日 19:35, Andrew Cooper wrote: On 26/05/16 09:29, Lan Tianyu wrote: To be viable going forwards, any solution must work with PVH/HVMLite as much as HVM. This alone negates qemu as a viable option. From a design point of view, having Xen needing to delegate to qemu to inject an interrupt into a guest seems backwards. Sorry, I am not familiar with HVMlite. HVMlite doesn't use Qemu and the qemu virtual iommu can't work for it. We have to rewrite virtual iommu in the Xen, right? A whole lot of this would be easier to reason about if/when we get a basic root port implementation in Xen, which is necessary for HVMLite, and which will make the interaction with qemu rather more clean. It is probably worth coordinating work in this area. The virtual iommu also should be under basic root port in Xen, right? As for the individual issue of 288vcpu support, there are already issues with 64vcpu guests at the moment. While it is certainly fine to remove the hard limit at 255 vcpus, there is a lot of other work required to even get 128vcpu guests stable. Could you give some points to these issues? We are enabling more vcpus support and it can boot up 255 vcpus without IR support basically. It's very helpful to learn about known issues. We will also add more tests for 128 vcpus into our regular test to find related bugs. Increasing max vcpu to 255 should be a good start. Hi Andrew: Could you give more inputs about issues with 64 vcpus and what needs to be done to make 128vcpu guest stable? We hope to do somethings to improve them. What's progress of PCI host bridge in Xen? From your opinion, we should do that first, right? Thanks. Very sorry for the delay. There are multiple interacting issues here. 
On the one side, it would be useful if we could have a central point of coordination on PVH/HVMLite work. Roger - as the person who last did HVMLite work, would you mind organising that? For the qemu/xen interaction, the current state is woeful and a tangled mess. I wish to ensure that we don't make any development decisions which makes the situation worse. In your case, the two motivations are quite different I would recommend dealing with them independently. IIRC, the issue with more than 255 cpus and interrupt remapping is that you can only use x2apic mode with more than 255 cpus, and IOAPIC RTEs can't be programmed to generate x2apic interrupts? In principle, if you don't have an IOAPIC, are there any other issues to be considered? What happens if you configure the LAPICs in x2apic mode, but have the IOAPIC deliver xapic interrupts? The key is the APIC ID. There is no modification to existing PCI MSI and IOAPIC with the introduction of x2apic. PCI MSI/IOAPIC can only send interrupt message containing 8bit APIC ID, which cannot address >255 cpus. Interrupt remapping supports 32bit APIC ID so it's necessary to enable >255 cpus with x2apic mode. If LAPIC is in x2apic while interrupt remapping is disabled, IOAPIC cannot deliver interrupts to all cpus in the system if #cpu > 255. Another key factor, Linux kernel disables x2apic mode when MAX APIC id is > 255 if no interrupt remapping function. The reason for this is what Kevin said. So booting up >255 cpus relies on the interrupt remapping. ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Re: [Xen-devel] Discussion about virtual iommu support for Xen guest
On 5/27/2016 4:19 PM, Lan Tianyu wrote: > As for the individual issue of 288vcpu support, there are already issues > with 64vcpu guests at the moment. While it is certainly fine to remove > the hard limit at 255 vcpus, there is a lot of other work required to > even get 128vcpu guests stable. Could you give some points to these issues? We are enabling more vcpus support and it can boot up 255 vcpus without IR support basically. It's very helpful to learn about known issues. Hi Andrew: We are designing vIOMMU support for Xen. Increasing vcpu from 128 to 255 also can be implemented parallelly since it doesn't need vIOMMU support. From your previous comment "there is a lot of other work required to even get 128vcpu guests stable", you have some concerns about stability of 128vcpus. I wonder what we need to do before starting work of increasing vcpu number from 128 to 255? ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] Xen virtual IOMMU high level design doc
Hi All: The following is our Xen vIOMMU high level design for detail discussion. Please have a look. Very appreciate for your comments. This design doesn't cover changes when root port is moved to hypervisor. We may design it later. Content: === 1. Motivation of vIOMMU 1.1 Enable more than 255 vcpus 1.2 Support VFIO-based user space driver 1.3 Support guest Shared Virtual Memory (SVM) 2. Xen vIOMMU Architecture 2.1 2th level translation overview 2.2 Interrupt remapping overview 3. Xen hypervisor 3.1 New vIOMMU hypercall interface 3.2 2nd level translation 3.3 Interrupt remapping 3.4 1st level translation 3.5 Implementation consideration 4. Qemu 4.1 Qemu vIOMMU framework 4.2 Dummy xen-vIOMMU driver 4.3 Q35 vs. i440x 4.4 Report vIOMMU to hvmloader 1 Motivation for Xen vIOMMU === 1.1 Enable more than 255 vcpu support HPC virtualization requires more than 255 vcpus support in a single VM to meet parallel computing requirement. More than 255 vcpus support requires interrupt remapping capability present on vIOMMU to deliver interrupt to #vcpu >255 Otherwise Linux guest fails to boot up with >255 vcpus if interrupt remapping is absent. 1.2 Support VFIO-based user space driver (e.g. DPDK) in the guest It relies on the 2nd level translation capability (IOVA->GPA) on vIOMMU. pIOMMU 2nd level becomes a shadowing structure of vIOMMU to isolate DMA requests initiated by user space driver. 1.3 Support guest SVM (Shared Virtual Memory) It relies on the 1st level translation table capability (GVA->GPA) on vIOMMU. pIOMMU needs to enable both 1st level and 2nd level translation in nested mode (GVA->GPA->HPA) for passthrough device. IGD passthrough is the main usage today (to support OpenCL 2.0 SVM feature). In the future SVM might be used by other I/O devices too. 2. 
Xen vIOMMU Architecture * vIOMMU will be inside Xen hypervisor for following factors 1) Avoid round trips between Qemu and Xen hypervisor 2) Ease of integration with the rest of the hypervisor 3) HVMlite/PVH doesn't use Qemu * Dummy xen-vIOMMU in Qemu as a wrapper of new hypercall to create /destory vIOMMU in hypervisor and deal with virtual PCI device's 2th level translation. 2.1 2th level translation overview For Virtual PCI device, dummy xen-vIOMMU does translation in the Qemu via new hypercall. For physical PCI device, vIOMMU in hypervisor shadows IO page table from IOVA->GPA to IOVA->HPA and load page table to physical IOMMU. The following diagram shows 2th level translation architecture. +-+ |Qemu++ | || Virtual| | || PCI device | | ||| | |++ | ||DMA | |V| | ++ Request ++ | | |+<---+| | | | Dummy xen vIOMMU | Target GPA | Memory region | | | |+--->+| | | +-+--++---++ | || || ||Hypercall || +++ |Hypervisor | || || || |v || | +--+--+|| | | vIOMMU||| | +--+--+|| || || |v || | +--+--+|| | | IOMMU driver||| | +--+--+|| || || +++ |HW v V| | +--+--+ +-+ | | | IOMMU +>+ Memory | | | +--+--+ +-+ | |^| ||| | +--+--+
Re: [Xen-devel] Xen virtual IOMMU high level design doc
On 8/17/2016 8:42 PM, Paul Durrant wrote: -Original Message- From: Xen-devel [mailto:xen-devel-boun...@lists.xen.org] On Behalf Of Lan, Tianyu Sent: 17 August 2016 13:06 To: Jan Beulich; Kevin Tian; Andrew Cooper; yang.zhang...@gmail.com; Jun Nakajima; Stefano Stabellini Cc: Anthony Perard; xuqu...@huawei.com; xen- de...@lists.xensource.com; Ian Jackson; Roger Pau Monne Subject: [Xen-devel] Xen virtual IOMMU high level design doc Hi All: The following is our Xen vIOMMU high level design for detail discussion. Please have a look. Very appreciate for your comments. This design doesn't cover changes when root port is moved to hypervisor. We may design it later. Content: == = 1. Motivation of vIOMMU 1.1 Enable more than 255 vcpus 1.2 Support VFIO-based user space driver 1.3 Support guest Shared Virtual Memory (SVM) 2. Xen vIOMMU Architecture 2.1 2th level translation overview 2.2 Interrupt remapping overview 3. Xen hypervisor 3.1 New vIOMMU hypercall interface Would it not have been better to build on the previously discussed (and mostly agreed) PV IOMMU interface? (See https://lists.xenproject.org/archives/html/xen-devel/2016-02/msg01428.html). An RFC implementation series was also posted (https://lists.xenproject.org/archives/html/xen-devel/2016-02/msg01441.html). Paul Hi Paul: Thanks for your input. Glance the patchset and it introduces hypercall "HYPERVISOR_iommu_op". The hypercall just works for PV IOMMU now. We may abstract it and make it work for both PV and Virtual IOMMU. ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Xen virtual IOMMU high level design doc
Hi Jan: Sorry for later response. Thanks a lot for your comments. On 2016年08月25日 19:11, Jan Beulich wrote: On 17.08.16 at 14:05, wrote: >> 1 Motivation for Xen vIOMMU >> >> === >> 1.1 Enable more than 255 vcpu support >> HPC virtualization requires more than 255 vcpus support in a single VM >> to meet parallel computing requirement. More than 255 vcpus support >> requires interrupt remapping capability present on vIOMMU to deliver >> interrupt to #vcpu >255 Otherwise Linux guest fails to boot up with >255 >> vcpus if interrupt remapping is absent. > > I continue to question this as a valid motivation at this point in > time, for the reasons Andrew has been explaining. If we want to support Linux guest with >255 vcpus, interrupt remapping is necessary. From Linux commit introducing x2apic and IR mode, it said IR was a pre-requisite for enabling x2apic mode in the CPU. https://lwn.net/Articles/289881/ So far, no sure behavior on the other OS. We may watch Windows guest behavior later on KVM and there is still a bug to run Windows guest with IR function on KVM. > >> 2. Xen vIOMMU Architecture >> >> >> >> * vIOMMU will be inside Xen hypervisor for following factors >> 1) Avoid round trips between Qemu and Xen hypervisor >> 2) Ease of integration with the rest of the hypervisor >> 3) HVMlite/PVH doesn't use Qemu >> * Dummy xen-vIOMMU in Qemu as a wrapper of new hypercall to create >> /destory vIOMMU in hypervisor and deal with virtual PCI device's 2th >> level translation. > > How does the create/destroy part of this match up with 3) right > ahead of it? The create/destroy hypercalls will work for both hvm and hvmlite. Suppose hvmlite has tool stack(E.G libxl) which can call new hypercalls to create or destroy virtual iommu in hypervisor. > >> 3 Xen hypervisor >> == >> >> 3.1 New hypercall XEN_SYSCTL_viommu_op >> 1) Definition of "struct xen_sysctl_viommu_op" as new hypercall parameter. 
>> >> struct xen_sysctl_viommu_op { >> u32 cmd; >> u32 domid; >> union { >> struct { >> u32 capabilities; >> } query_capabilities; >> struct { >> u32 capabilities; >> u64 base_address; >> } create_iommu; >> struct { >> u8 bus; >> u8 devfn; > > Please can we avoid introducing any new interfaces without segment/ > domain value, even if for now it'll be always zero? Sure. Will add segment field. > >> u64 iova; >> u64 translated_addr; >> u64 addr_mask; /* Translation page size */ >> IOMMUAccessFlags permisson; >> } 2th_level_translation; > > I suppose "translated_addr" is an output here, but for the following > fields this already isn't clear. Please add IN and OUT annotations for > clarity. > > Also, may I suggest to name this "l2_translation"? (But there are > other implementation specific things to be considered here, which > I guess don't belong into a design doc discussion.) How about this? struct { /* IN parameters. */ u8 segment; u8 bus; u8 devfn; u64 iova; /* Out parameters. */ u64 translated_addr; u64 addr_mask; /* Translation page size */ IOMMUAccessFlags permisson; } l2_translation; > >> }; >> >> typedef enum { >> IOMMU_NONE = 0, >> IOMMU_RO = 1, >> IOMMU_WO = 2, >> IOMMU_RW = 3, >> } IOMMUAccessFlags; >> >> >> Definition of VIOMMU subops: >> #define XEN_SYSCTL_viommu_query_capability 0 >> #define XEN_SYSCTL_viommu_create 1 >> #define XEN_SYSCTL_viommu_destroy2 >> #define XEN_SYSCTL_viommu_dma_translation_for_vpdev 3 >> >> Definition of VIOMMU capabilities >> #define XEN_VIOMMU_CAPABILITY_1nd_level_translation (1 << 0) >> #define XEN_VIOMMU_CAPABILITY_2nd_level_translation (1 << 1) > > l1 and l2 respectively again, please. Will update. > >> 3.3 Interrupt remapping >> Interrupts from virtual devices and physical devices will be delivered >> to vlapic from vIOAPIC and vMSI. It needs to add interrupt remapping >> hooks in the vmsi_deliver() and ioapic_deliver() to find target vlapic >> according interrupt remapping table. The following diagram shows the logic. 
> > Missing diagram or stale sentence? Sorry. It's stale sentence and moved the diagram to 2.2 Interrupt remapping overview. > >> 3.5 Implementation consideration >> Linux Intel IOMMU driver will fail to be loaded without 2th level >> translation support even if interrupt remapping and 1th le
Re: [Xen-devel] Xen virtual IOMMU high level design doc
On 2016年08月31日 20:02, Jan Beulich wrote: On 31.08.16 at 10:39, wrote: >> > On 2016年08月25日 19:11, Jan Beulich wrote: >> > On 17.08.16 at 14:05, wrote: >>> 1 Motivation for Xen vIOMMU >>> >>> === >>> 1.1 Enable more than 255 vcpu support >>> HPC virtualization requires more than 255 vcpus support in a single VM >>> to meet parallel computing requirement. More than 255 vcpus support >>> requires interrupt remapping capability present on vIOMMU to deliver >>> interrupt to #vcpu >255 Otherwise Linux guest fails to boot up with >>> >255 >>> vcpus if interrupt remapping is absent. >>> >> >>> >> I continue to question this as a valid motivation at this point in >>> >> time, for the reasons Andrew has been explaining. >> > >> > If we want to support Linux guest with >255 vcpus, interrupt remapping >> > is necessary. > I don't understand why you keep repeating this, without adding > _why_ you think there is a demand for such guests and _what_ > your plans are to eliminate Andrew's concerns. > The motivation for such a huge VM is HPC (High-Performance Computing) cloud service, which requires high-performance parallel computing. We create just a single VM on one machine and expose more than 255 pcpus to the VM in order to ensure high-performance parallel computing in the VM. One vcpu is pinned on each pcpu. For performance, we achieved good results (>95% of native performance for the stream, dgemm and sgemm benchmarks in the VM) after some tuning and optimizations. We presented these at this year's Xen summit. For stability, Andrew found some issues with a huge VM with the watchdog enabled that cause a hypervisor reboot. We will reproduce and fix them. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH v2 1/2] Xen/Keyhandler: Rework process of nonirq keyhandler
On 10/12/2016 9:19 PM, Jan Beulich wrote: On 12.10.16 at 09:58, wrote: --- a/xen/drivers/char/console.c +++ b/xen/drivers/char/console.c @@ -347,7 +347,7 @@ static void switch_serial_input(void) static void __serial_rx(char c, struct cpu_user_regs *regs) { if ( xen_rx ) -return handle_keypress(c, regs); +return handle_keypress(c, regs, true); I think it would be nice to pass true here only when in polling mode, unless you know or can deduce that a similar problem also exists in IRQ mode. Perhaps you could simply move the !in_irq() here? That's a good idea. Thanks. (Of course the new function parameter would then want to be renamed.) Since the issue happens when handle_keypress() runs in a timer handler, how about naming the new parameter "intimer"? __serial_rx() is called in a timer handler or an interrupt handler. Or do you have another suggestion? Jan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH v2 0/2] Xen: Fix Xen hypervisor panic during dumping timer info on huge machine.
On 10/12/2016 7:08 PM, Ian Jackson wrote: Wei Liu writes ("Re: [PATCH v2 0/2] Xen: Fix Xen hypervisor panic during dumping timer info on huge machine."): On Wed, Oct 12, 2016 at 04:20:02PM +0800, Lan Tianyu wrote: On 2016年10月12日 16:09, Jan Beulich wrote: Also, any reason you send to the list twice (once @lists.xen.org, and another time to @lists.xenproject.org)? Sometime I found my patches wasn't able to arrive xen-devel and so send to both xen.org and xenproject.org maillist. I will double check. Both addresses should work. There are glitches from time to time though. So do report to us if this happens again. I don't think either address is likely to work differently or separately to the other. So please just send to one, and if it doesn't work, please report it and we will try to fix it. Ok. I get it. Thanks, Ian. ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH v2 1/2] Xen/Keyhandler: Rework process of nonirq keyhandler
On 2016年10月13日 00:03, Jan Beulich wrote: On 12.10.16 at 16:30, wrote: >> >> Since the issue happens when handle_keypress() runs in a timer handler, >> how about to name new parameter "intimer"? __serial_rx() is called in a >> timer handler or interrupt handler. Or do you have other suggestion? > > I think "intimer" can be confusing (to be mixed up with timer interrupt). > How about "force_tasklet"? OK. I will update. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH V3] Xen/Keyhandler: Rework process of nonirq keyhandler
Keyhandler may run for a long time in serial port driver's timer handler on the large machine with a lot of physical cpus(e,g dump_timerq()) when serial port driver works in the poll mode(via the exception mechanism). If a timer handler runs a long time, it will block nmi_timer_fn() to feed NMI watchdog and cause Xen hypervisor panic. Inserting process_pending_softirqs() in timer handler will not help. when timer interrupt arrives, timer subsystem calls all expired timer handlers before programming next timer interrupt. There is no timer interrupt arriving to trigger timer softirq during run a timer handler. This patch is to fix the issue to make nonirq keyhandler run in tasklet when receive debug key from serial port. Signed-off-by: Lan Tianyu --- xen/common/keyhandler.c |8 +--- xen/common/sysctl.c |2 +- xen/drivers/char/console.c |2 +- xen/include/xen/keyhandler.h |4 +++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c index 16de6e8..005ef99 100644 --- a/xen/common/keyhandler.c +++ b/xen/common/keyhandler.c @@ -75,19 +75,21 @@ static struct keyhandler { static void keypress_action(unsigned long unused) { -handle_keypress(keypress_key, NULL); +console_start_log_everything(); +key_table[keypress_key].fn(keypress_key); +console_end_log_everything(); } static DECLARE_TASKLET(keypress_tasklet, keypress_action, 0); -void handle_keypress(unsigned char key, struct cpu_user_regs *regs) +void handle_keypress(unsigned char key, struct cpu_user_regs *regs, bool force_tasklet) { struct keyhandler *h; if ( key >= ARRAY_SIZE(key_table) || !(h = &key_table[key])->fn ) return; -if ( !in_irq() || h->irq_callback ) +if ( h->irq_callback || !force_tasklet ) { console_start_log_everything(); h->irq_callback ? 
h->irq_fn(key, regs) : h->fn(key); diff --git a/xen/common/sysctl.c b/xen/common/sysctl.c index 8aea6ef..1eb7bad 100644 --- a/xen/common/sysctl.c +++ b/xen/common/sysctl.c @@ -136,7 +136,7 @@ long do_sysctl(XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl) { if ( copy_from_guest_offset(&c, op->u.debug_keys.keys, i, 1) ) goto out; -handle_keypress(c, guest_cpu_user_regs()); +handle_keypress(c, guest_cpu_user_regs(), false); } ret = 0; copyback = 0; diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c index 55ae31a..b0f74ce 100644 --- a/xen/drivers/char/console.c +++ b/xen/drivers/char/console.c @@ -347,7 +347,7 @@ static void switch_serial_input(void) static void __serial_rx(char c, struct cpu_user_regs *regs) { if ( xen_rx ) -return handle_keypress(c, regs); +return handle_keypress(c, regs, !in_irq()); /* Deliver input to guest buffer, unless it is already full. */ if ( (serial_rx_prod-serial_rx_cons) != SERIAL_RX_SIZE ) diff --git a/xen/include/xen/keyhandler.h b/xen/include/xen/keyhandler.h index 06c05c8..e9595bd 100644 --- a/xen/include/xen/keyhandler.h +++ b/xen/include/xen/keyhandler.h @@ -46,7 +46,9 @@ void register_irq_keyhandler(unsigned char key, bool_t diagnostic); /* Inject a keypress into the key-handling subsystem. */ -extern void handle_keypress(unsigned char key, struct cpu_user_regs *regs); +extern void handle_keypress(unsigned char key, + struct cpu_user_regs *regs, + bool async); /* Scratch space is available for use of any keyhandler. */ extern char keyhandler_scratch[1024]; -- 1.7.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] Xen virtual IOMMU high level design doc V2
Change since V1: 1) Update motivation for Xen vIOMMU - 288 vcpus support part 2) Change definition of struct xen_sysctl_viommu_op 3) Update "3.5 Implementation consideration" to explain why we needs to enable l2 translation first. 4) Update "4.3 Q35 vs I440x" - Linux/Windows VTD drivers can work on the emulated I440 chipset. 5) Remove stale statement in the "3.3 Interrupt remapping" Content: === 1. Motivation of vIOMMU 1.1 Enable more than 255 vcpus 1.2 Support VFIO-based user space driver 1.3 Support guest Shared Virtual Memory (SVM) 2. Xen vIOMMU Architecture 2.1 l2 translation overview 2.2 Interrupt remapping overview 3. Xen hypervisor 3.1 New vIOMMU hypercall interface 3.2 l2 translation 3.3 Interrupt remapping 3.4 l1 translation 3.5 Implementation consideration 4. Qemu 4.1 Qemu vIOMMU framework 4.2 Dummy xen-vIOMMU driver 4.3 Q35 vs. i440x 4.4 Report vIOMMU to hvmloader 1 Motivation for Xen vIOMMU === 1.1 Enable more than 255 vcpu support HPC cloud service requires VM provides high performance parallel computing and we hope to create a huge VM with >255 vcpu on one machine to meet such requirement.Ping each vcpus on separated pcpus. More than 255 vcpus support requires X2APIC and Linux disables X2APIC mode if there is no interrupt remapping function which is present by vIOMMU. Interrupt remapping function helps to deliver interrupt to #vcpu >255. So we need to add vIOMMU before enabling >255 vcpus. 1.2 Support VFIO-based user space driver (e.g. DPDK) in the guest It relies on the l2 translation capability (IOVA->GPA) on vIOMMU. pIOMMU l2 becomes a shadowing structure of vIOMMU to isolate DMA requests initiated by user space driver. 1.3 Support guest SVM (Shared Virtual Memory) It relies on the l1 translation table capability (GVA->GPA) on vIOMMU. pIOMMU needs to enable both l1 and l2 translation in nested mode (GVA->GPA->HPA) for passthrough device. IGD passthrough is the main usage today (to support OpenCL 2.0 SVM feature). 
In the future SVM might be used by other I/O devices too. 2. Xen vIOMMU Architecture * vIOMMU will be inside Xen hypervisor for following factors 1) Avoid round trips between Qemu and Xen hypervisor 2) Ease of integration with the rest of the hypervisor 3) HVMlite/PVH doesn't use Qemu * Dummy xen-vIOMMU in Qemu as a wrapper of new hypercall to create /destory vIOMMU in hypervisor and deal with virtual PCI device's l2 translation. 2.1 l2 translation overview For Virtual PCI device, dummy xen-vIOMMU does translation in the Qemu via new hypercall. For physical PCI device, vIOMMU in hypervisor shadows IO page table from IOVA->GPA to IOVA->HPA and load page table to physical IOMMU. The following diagram shows l2 translation architecture. +-+ |Qemu++ | || Virtual| | || PCI device | | ||| | |++ | ||DMA | |V| | ++ Request ++ | | |+<---+| | | | Dummy xen vIOMMU | Target GPA | Memory region | | | |+--->+| | | +-+--++---++ | || || ||Hypercall || +++ |Hypervisor | || || || |v || | +--+--+|| | | vIOMMU||| | +--+--+|| || || |v || | +--+--+|| | | IOMMU driver||| | +--+--+|| || || +++ |HW v V| | +--+--+ +-+ | | | IOMMU +-
Re: [Xen-devel] Xen virtual IOMMU high level design doc V2
Hi Andrew: Thanks for your review. On 2016年10月19日 03:17, Andrew Cooper wrote: On 18/10/16 15:14, Lan Tianyu wrote: Change since V1: 1) Update motivation for Xen vIOMMU - 288 vcpus support part 2) Change definition of struct xen_sysctl_viommu_op 3) Update "3.5 Implementation consideration" to explain why we needs to enable l2 translation first. 4) Update "4.3 Q35 vs I440x" - Linux/Windows VTD drivers can work on the emulated I440 chipset. 5) Remove stale statement in the "3.3 Interrupt remapping" Content: === 1. Motivation of vIOMMU 1.1 Enable more than 255 vcpus 1.2 Support VFIO-based user space driver 1.3 Support guest Shared Virtual Memory (SVM) 2. Xen vIOMMU Architecture 2.1 l2 translation overview 2.2 Interrupt remapping overview 3. Xen hypervisor 3.1 New vIOMMU hypercall interface 3.2 l2 translation 3.3 Interrupt remapping 3.4 l1 translation 3.5 Implementation consideration 4. Qemu 4.1 Qemu vIOMMU framework 4.2 Dummy xen-vIOMMU driver 4.3 Q35 vs. i440x 4.4 Report vIOMMU to hvmloader 1 Motivation for Xen vIOMMU === 1.1 Enable more than 255 vcpu support HPC cloud service requires VM provides high performance parallel computing and we hope to create a huge VM with >255 vcpu on one machine to meet such requirement.Ping each vcpus on separated pcpus. More than Pin ? Sorry, it's a typo. Also, grammatically speaking, I think you mean "each vcpu to separate pcpus". Yes. 255 vcpus support requires X2APIC and Linux disables X2APIC mode if there is no interrupt remapping function which is present by vIOMMU. Interrupt remapping function helps to deliver interrupt to #vcpu >255. This is only a requirement for xapic interrupt sources. x2apic interrupt sources already deliver correctly. The key is the APIC ID. There is no modification to existing PCI MSI and IOAPIC with the introduction of x2apic. PCI MSI/IOAPIC can only send interrupt message containing 8bit APIC ID, which cannot address >255 cpus. 
Interrupt remapping supports 32bit APIC ID so it's necessary to enable >255 cpus with x2apic mode. If LAPIC is in x2apic while interrupt remapping is disabled, IOAPIC cannot deliver interrupts to all cpus in the system if #cpu > 255. 1.3 Support guest SVM (Shared Virtual Memory) It relies on the l1 translation table capability (GVA->GPA) on vIOMMU. pIOMMU needs to enable both l1 and l2 translation in nested mode (GVA->GPA->HPA) for passthrough device. IGD passthrough is the main usage today (to support OpenCL 2.0 SVM feature). In the future SVM might be used by other I/O devices too. As an aside, how is IGD intending to support SVM? Will it be with PCIe ATS/PASID, or something rather more magic as IGD is on the same piece of silicon? IGD on Skylake supports PCIe PASID. 2. Xen vIOMMU Architecture * vIOMMU will be inside Xen hypervisor for following factors 1) Avoid round trips between Qemu and Xen hypervisor 2) Ease of integration with the rest of the hypervisor 3) HVMlite/PVH doesn't use Qemu * Dummy xen-vIOMMU in Qemu as a wrapper of new hypercall to create /destory vIOMMU in hypervisor and deal with virtual PCI device's l2 translation. 2.1 l2 translation overview For Virtual PCI device, dummy xen-vIOMMU does translation in the Qemu via new hypercall. For physical PCI device, vIOMMU in hypervisor shadows IO page table from IOVA->GPA to IOVA->HPA and load page table to physical IOMMU. The following diagram shows l2 translation architecture. Which scenario is this? Is this the passthrough case where the Qemu Virtual PCI device is a shadow of the real PCI device in hardware? No, this is for traditional virtual pci device emulated by Qemu and passthough PCI device. +-+ |Qemu++ | || Virtual| | || PCI device | | ||| | |++ | ||DMA | |V| | ++ Request ++ | | |+<---+| | | | Dummy xen vIOMMU | Target GPA | Memory region | | | |+--->+| | | +-+--++---++ | || || ||Hypercall
Re: [Xen-devel] Xen virtual IOMMU high level design doc V2
On 10/19/2016 4:26 AM, Konrad Rzeszutek Wilk wrote: On Tue, Oct 18, 2016 at 10:14:16PM +0800, Lan Tianyu wrote: 1 Motivation for Xen vIOMMU === 1.1 Enable more than 255 vcpu support HPC cloud service requires VM provides high performance parallel computing and we hope to create a huge VM with >255 vcpu on one machine to meet such requirement.Ping each vcpus on separated pcpus. More than 255 vcpus support requires X2APIC and Linux disables X2APIC mode if there is no interrupt remapping function which is present by vIOMMU. Interrupt remapping function helps to deliver interrupt to #vcpu >255. So we need to add vIOMMU before enabling >255 vcpus. What about Windows? Does it care about this? From our test, win8 guest crashes when boot up 288 vcpus without IR and it can boot up with IR 3.2 l2 translation 1) For virtual PCI device Xen dummy xen-vIOMMU in Qemu translates IOVA to target GPA via new hypercall when DMA operation happens. 2) For physical PCI device DMA operations go though physical IOMMU directly and IO page table for IOVA->HPA should be loaded into physical IOMMU. When guest updates l2 Page-table pointer field, it provides IO page table for IOVA->GPA. vIOMMU needs to shadow l2 translation table, translate GPA->HPA and update shadow page table(IOVA->HPA) pointer to l2 Page-table pointer to context entry of physical IOMMU. Now all PCI devices in same hvm domain share one IO page table (GPA->HPA) in physical IOMMU driver of Xen. To support l2 translation of vIOMMU, IOMMU driver need to support multiple address spaces per device entry. Using existing IO page table(GPA->HPA) defaultly and switch to shadow IO page table(IOVA->HPA) when l2 defaultly? I mean GPA->HPA mapping will set in the assigned device's context entry of pIOMMU when VM creates. Just like current code works. 3.3 Interrupt remapping Interrupts from virtual devices and physical devices will be delivered to vlapic from vIOAPIC and vMSI. 
It needs to add interrupt remapping hooks in the vmsi_deliver() and ioapic_deliver() to find target vlapic according interrupt remapping table. 3.4 l1 translation When nested translation is enabled, any address generated by l1 translation is used as the input address for nesting with l2 translation. Physical IOMMU needs to enable both l1 and l2 translation in nested translation mode(GVA->GPA->HPA) for passthrough device. VT-d context entry points to guest l1 translation table which will be nest-translated by l2 translation table and so it can be directly linked to context entry of physical IOMMU. I think this means that the shared_ept will be disabled? The shared_ept(GPA->HPA mapping) is used to do nested translation for any output from l1 translation(GVA->GPA). ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH] Xen: Force non-irq keyhandler to be run in tasklet when receive a debugkey from serial port
__serial_rx() runs in either irq handler or timer handler and non-irq keyhandler should not run in these contexts. So always force non-irq keyhandler to run in tasklet when receiving a debugkey from serial port Signed-off-by: Lan Tianyu --- xen/drivers/char/console.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c index b0f74ce..184b523 100644 --- a/xen/drivers/char/console.c +++ b/xen/drivers/char/console.c @@ -347,7 +347,7 @@ static void switch_serial_input(void) static void __serial_rx(char c, struct cpu_user_regs *regs) { if ( xen_rx ) -return handle_keypress(c, regs, !in_irq()); +return handle_keypress(c, regs, true); /* Deliver input to guest buffer, unless it is already full. */ if ( (serial_rx_prod-serial_rx_cons) != SERIAL_RX_SIZE ) -- 2.9.3 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH v2 2/2] Xen/timer: Process softirq during dumping timer info
On 10/22/2016 1:27 AM, Wei Liu wrote: On Wed, Oct 12, 2016 at 03:58:24PM +0800, Lan Tianyu wrote: Dumping timer info may run for a long time on the huge machine with a lot of physical cpus. To avoid triggering NMI watchdog, add process_pending_softirqs() in the loop of dumping timer info. Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Lan Tianyu --- xen/common/timer.c |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/xen/common/timer.c b/xen/common/timer.c index 29a60a9..ab6bca0 100644 --- a/xen/common/timer.c +++ b/xen/common/timer.c @@ -530,6 +530,7 @@ static void dump_timerq(unsigned char key) { ts = &per_cpu(timers, i); +process_pending_softirqs(); This is causing issues in ARM (x86 has a similar issue): Oct 20 01:43:31.410010 (XEN) Xen call trace: Oct 20 01:43:31.410048 (XEN)[<00233920>] process_pending_softirqs+0x34/0x5c (PC) Oct 20 01:43:31.417990 (XEN)[<00237c6c>] timer.c#dump_timerq+0x9c/0x1fc (LR) Oct 20 01:43:31.418030 (XEN)[<00218658>] handle_keypress+0xc0/0xf4 Oct 20 01:43:31.426001 (XEN)[<002490c8>] console.c#__serial_rx+0x4c/0x9c Oct 20 01:43:31.433970 (XEN)[<00249b74>] console.c#serial_rx+0xcc/0xe4 Oct 20 01:43:31.434007 (XEN)[<0024b6ec>] serial_rx_interrupt+0xcc/0xf8 Oct 20 01:43:31.441964 (XEN)[<0024ae54>] exynos4210-uart.c#exynos4210_uart_interrupt+0xf8/0x160 Oct 20 01:43:31.450001 (XEN)[<00256338>] do_IRQ+0x1a0/0x228 Oct 20 01:43:31.450040 (XEN)[<00254074>] gic_interrupt+0x58/0xfc Oct 20 01:43:31.457985 (XEN)[<00260f98>] do_trap_irq+0x24/0x38 Oct 20 01:43:31.458022 (XEN)[<00264970>] entry.o#return_from_trap+0/0x4 Oct 20 01:43:31.466010 (XEN)[<0030a240>] 0030a240 Oct 20 01:43:31.466044 (XEN) Oct 20 01:43:31.466066 (XEN) Oct 20 01:43:31.466099 (XEN) Oct 20 01:43:31.473998 (XEN) Panic on CPU 0: Oct 20 01:43:31.474029 (XEN) Assertion '!in_irq() && local_irq_is_enabled()' failed at softirq.c:57 Oct 20 01:43:31.481982 (XEN) See 
http://logs.test-lab.xenproject.org/osstest/logs/101571/test-armhf-armhf-libvirt/serial-arndale-bluewater.log I've reverted this patch in staging. Wei. dump_timerq() or other non-irq keyhandlers should not run in irq context and has sent out a fix patch. https://lists.xen.org/archives/html/xen-devel/2016-10/msg01391.html ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Xen virtual IOMMU high level design doc V2
On 10/21/2016 4:36 AM, Andrew Cooper wrote: 255 vcpus support requires X2APIC and Linux disables X2APIC mode if there is no interrupt remapping function which is present by vIOMMU. Interrupt remapping function helps to deliver interrupt to #vcpu >255. This is only a requirement for xapic interrupt sources. x2apic interrupt sources already deliver correctly. The key is the APIC ID. There is no modification to existing PCI MSI and IOAPIC with the introduction of x2apic. PCI MSI/IOAPIC can only send interrupt message containing 8bit APIC ID, which cannot address >255 cpus. Interrupt remapping supports 32bit APIC ID so it's necessary to enable >255 cpus with x2apic mode. If LAPIC is in x2apic while interrupt remapping is disabled, IOAPIC cannot deliver interrupts to all cpus in the system if #cpu > 255. After spending a long time reading up on this, my first observation is that it is very difficult to find consistent information concerning the expected content of MSI address/data fields for x86 hardware. Having said that, this has been very educational. It is now clear that any MSI message can either specify an 8 bit APIC ID directly, or request for the message to be remapped. Apologies for my earlier confusion. Never minder, I will describe this more detail in the following version. 3 Xen hypervisor == 3.1 New hypercall XEN_SYSCTL_viommu_op This hypercall should also support pv IOMMU which is still under RFC review. Here only covers non-pv part. 1) Definition of "struct xen_sysctl_viommu_op" as new hypercall parameter. Why did you choose sysctl? As these are per-domain, domctl would be a more logical choice. However, neither of these should be usable by Qemu, and we are trying to split out "normal qemu operations" into dmops which can be safely deprivileged. Do you know what's the status of dmop now? I just found some discussions about design in the maillist. We may use domctl first and move to dmop when it's ready? 
I believe Paul is looking into respin the series early in the 4.9 dev cycle. I expect it won't take long until they are submitted. Ok. I got it. Thanks for information. Definition of VIOMMU subops: #define XEN_SYSCTL_viommu_query_capability0 #define XEN_SYSCTL_viommu_create1 #define XEN_SYSCTL_viommu_destroy2 #define XEN_SYSCTL_viommu_dma_translation_for_vpdev 3 Definition of VIOMMU capabilities #define XEN_VIOMMU_CAPABILITY_l1_translation(1 << 0) #define XEN_VIOMMU_CAPABILITY_l2_translation(1 << 1) #define XEN_VIOMMU_CAPABILITY_interrupt_remapping(1 << 2) How are vIOMMUs going to be modelled to guests? On real hardware, they all seem to end associated with a PCI device of some sort, even if it is just the LPC bridge. This design just considers one vIOMMU has all PCI device under its specified PCI Segment. "INCLUDE_PCI_ALL" bit of DRHD struct is set for vIOMMU. Even if the first implementation only supports a single vIOMMU, please design the interface to cope with multiple. It will save someone having to go and break the API/ABI in the future when support for multiple vIOMMUs is needed. OK. I got. How do we deal with multiple vIOMMUs in a single guest? For multi-vIOMMU, we need to add new field in the struct iommu_op to designate device scope of vIOMMUs if they are under same PCI segment. This also needs to change DMAR table. 2) Design for subops - XEN_SYSCTL_viommu_query_capability Get vIOMMU capabilities(l1/l2 translation and interrupt remapping). - XEN_SYSCTL_viommu_create Create vIOMMU in Xen hypervisor with dom_id, capabilities and reg base address. - XEN_SYSCTL_viommu_destroy Destory vIOMMU in Xen hypervisor with dom_id as parameters. - XEN_SYSCTL_viommu_dma_translation_for_vpdev Translate IOVA to GPA for specified virtual PCI device with dom id, PCI device's bdf and IOVA and xen hypervisor returns translated GPA, address mask and access permission. 
3.2 l2 translation 1) For virtual PCI device Xen dummy xen-vIOMMU in Qemu translates IOVA to target GPA via new hypercall when DMA operation happens. 2) For physical PCI device DMA operations go though physical IOMMU directly and IO page table for IOVA->HPA should be loaded into physical IOMMU. When guest updates l2 Page-table pointer field, it provides IO page table for IOVA->GPA. vIOMMU needs to shadow l2 translation table, translate GPA->HPA and update shadow page table(IOVA->HPA) pointer to l2 Page-table pointer to context entry of physical IOMMU. How are you proposing to do this shadowing? Do we need to trap and emulate all writes to the vIOMMU pagetables, or is there a better way to know when the mappings need invalidating? No, we don't need to trap all write to IO page table. From VTD spec 6.1, "Reporting the Caching Mode as Set for the virtual hardware requires the guest software to explicitly issue invalidatio
Re: [Xen-devel] [PATCH] Xen: Force non-irq keyhandler to be run in tasklet when receive a debugkey from serial port
On 10/24/2016 8:19 AM, Konrad Rzeszutek Wilk wrote: On Sat, Oct 22, 2016 at 07:23:03PM +0800, Lan Tianyu wrote: __serial_rx() runs in either irq handler or timer handler and non-irq keyhandler should not run in these contexts. So always force non-irq keyhandler to run in tasklet when receive a debugkey from serial port If the machine is hung with an IRQ handler being stuck, and one does 'Ctrl-Ax3` followed by 'C' .. which would not be invoked (as it is not an IRQ handler?? If serial port's interrupt still works in this case, the 'C' keyhandler kexec_crash() will be invoked in a tasklet. This behavior was changed by my patches if includes this patch. Signed-off-by: Lan Tianyu --- xen/drivers/char/console.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c index b0f74ce..184b523 100644 --- a/xen/drivers/char/console.c +++ b/xen/drivers/char/console.c @@ -347,7 +347,7 @@ static void switch_serial_input(void) static void __serial_rx(char c, struct cpu_user_regs *regs) { if ( xen_rx ) -return handle_keypress(c, regs, !in_irq()); +return handle_keypress(c, regs, true); /* Deliver input to guest buffer, unless it is already full. */ if ( (serial_rx_prod-serial_rx_cons) != SERIAL_RX_SIZE ) -- 2.9.3 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH] Xen: Force non-irq keyhandler to be run in tasklet when receive a debugkey from serial port
On 10/24/2016 6:53 PM, Jan Beulich wrote: On 22.10.16 at 13:23, wrote: __serial_rx() runs in either irq handler or timer handler and non-irq keyhandler should not run in these contexts. So always force non-irq keyhandler to run in tasklet when receive a debugkey from serial port Signed-off-by: Lan Tianyu --- xen/drivers/char/console.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c index b0f74ce..184b523 100644 --- a/xen/drivers/char/console.c +++ b/xen/drivers/char/console.c @@ -347,7 +347,7 @@ static void switch_serial_input(void) static void __serial_rx(char c, struct cpu_user_regs *regs) { if ( xen_rx ) -return handle_keypress(c, regs, !in_irq()); +return handle_keypress(c, regs, true); Together with one of your earlier patches having got reverted, I think we need to take a step back here instead of going back to what was requested to be changed from v2 of the original patch. In particular I assume that the problem you're trying to address is not limited to dump_timerq() - at least dump_runq() should be as problematic on many-CPU systems. I think the issue here is that my previous patch commit 610b4eda2c("keyhandler: rework process of nonirq keyhandler") makes non-irq keyhandler run in irq context. This is caused by input param "!in_irq()" which is false in irq context. handle_keypress() runs keyhandler synchronically. This patch fixes the issue. I think (and I vaguely recall possibly having said so during earlier review) that dump functions the output of which depends on CPU count should get modeled after dump_registers(), and it might be worth abstracting this in keyhandler.c. Yes, but this sounds like a new feature or framework rework rather than a fix patch. In any case quite likely the other patch of yours (which the one here basically modifies) may then also want to be reverted. I think patch "timer: process softirq during dumping timer" does right thing. 
The issue is triggered by previous patch. ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH] Xen: Force non-irq keyhandler to be run in tasklet when receive a debugkey from serial port
On 10/24/2016 9:38 PM, Konrad Rzeszutek Wilk wrote: On Mon, Oct 24, 2016 at 09:29:53PM +0800, Lan, Tianyu wrote: On 10/24/2016 8:19 AM, Konrad Rzeszutek Wilk wrote: On Sat, Oct 22, 2016 at 07:23:03PM +0800, Lan Tianyu wrote: __serial_rx() runs in either irq handler or timer handler and non-irq keyhandler should not run in these contexts. So always force non-irq keyhandler to run in tasklet when receive a debugkey from serial port If the machine is hung with an IRQ handler being stuck, and one does 'Ctrl-Ax3` followed by 'C' .. which would not be invoked (as it is not an IRQ handler?? If serial port's interrupt still works in this case, the 'C' keyhandler kexec_crash() will be invoked in a tasklet. This behavior was changed by my patches if includes this patch. Right, but the tasklet won't get to run at that point - as for example the IRQ handler is stuck - so tasklets never get run? Or maybe they do on another CPU? If serial interrupt handler works, the cpu receiving serial port interrupt should work normally. Tasklet_schedule() in the handle_keypress() queues keyhandler tasklet to that cpu and tasklet also should get to run at that point. . ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH] Xen: Force non-irq keyhandler to be run in tasklet when receive a debugkey from serial port
On 10/24/2016 9:54 PM, Jan Beulich wrote: On 24.10.16 at 15:29, wrote: On 10/24/2016 8:19 AM, Konrad Rzeszutek Wilk wrote: On Sat, Oct 22, 2016 at 07:23:03PM +0800, Lan Tianyu wrote: __serial_rx() runs in either irq handler or timer handler and non-irq keyhandler should not run in these contexts. So always force non-irq keyhandler to run in tasklet when receive a debugkey from serial port If the machine is hung with an IRQ handler being stuck, and one does 'Ctrl-Ax3` followed by 'C' .. which would not be invoked (as it is not an IRQ handler?? If serial port's interrupt still works in this case, the 'C' keyhandler kexec_crash() will be invoked in a tasklet. This behavior was changed by my patches if includes this patch. Sorry. A typo. I meant the behavior wasn't changed by my patches. As indicated already by Konrad's reply, this is not going to be acceptable. ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH] Xen: Force non-irq keyhandler to be run in tasklet when receive a debugkey from serial port
On 10/24/2016 10:31 PM, Jan Beulich wrote: On 24.10.16 at 16:15, wrote: On 10/24/2016 9:54 PM, Jan Beulich wrote: On 24.10.16 at 15:29, wrote: On 10/24/2016 8:19 AM, Konrad Rzeszutek Wilk wrote: On Sat, Oct 22, 2016 at 07:23:03PM +0800, Lan Tianyu wrote: __serial_rx() runs in either irq handler or timer handler and non-irq keyhandler should not run in these contexts. So always force non-irq keyhandler to run in tasklet when receive a debugkey from serial port If the machine is hung with an IRQ handler being stuck, and one does 'Ctrl-Ax3` followed by 'C' .. which would not be invoked (as it is not an IRQ handler?? If serial port's interrupt still works in this case, the 'C' keyhandler kexec_crash() will be invoked in a tasklet. This behavior was changed by my patches if includes this patch. Sorry. A typo. I meant the behavior wasn't changed by my patches. How was it not? The softirq machinery didn't get invoked in that case prior to your patch, afaict. Which softirq? You mean adding process_pending_softirqs() in the dump_timerq()? ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH] Xen: Force non-irq keyhandler to be run in tasklet when receive a debugkey from serial port
On 10/24/2016 10:28 PM, Jan Beulich wrote: On 24.10.16 at 16:01, wrote: On 10/24/2016 6:53 PM, Jan Beulich wrote: On 22.10.16 at 13:23, wrote: __serial_rx() runs in either irq handler or timer handler and non-irq keyhandler should not run in these contexts. So always force non-irq keyhandler to run in tasklet when receive a debugkey from serial port Signed-off-by: Lan Tianyu --- xen/drivers/char/console.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xen/drivers/char/console.c b/xen/drivers/char/console.c index b0f74ce..184b523 100644 --- a/xen/drivers/char/console.c +++ b/xen/drivers/char/console.c @@ -347,7 +347,7 @@ static void switch_serial_input(void) static void __serial_rx(char c, struct cpu_user_regs *regs) { if ( xen_rx ) -return handle_keypress(c, regs, !in_irq()); +return handle_keypress(c, regs, true); Together with one of your earlier patches having got reverted, I think we need to take a step back here instead of going back to what was requested to be changed from v2 of the original patch. In particular I assume that the problem you're trying to address is not limited to dump_timerq() - at least dump_runq() should be as problematic on many-CPU systems. I think the issue here is that my previous patch commit 610b4eda2c("keyhandler: rework process of nonirq keyhandler") makes non-irq keyhandler run in irq context. This is caused by input param "!in_irq()" which is false in irq context. handle_keypress() runs keyhandler synchronically. This patch fixes the issue. Not really - your earlier patch only moved the !in_irq() check, i.e. things continued to run in the same context they always did _except_ for the one special case you cared about. I supposed the special case you meant is to run keyhandler in timer handler. It's necessary to make any timer handler run in a short time otherwise it will trigger watchdog problem. Plus your other patch fixed the respective issue only for one individual handler, instead of generally. 
So you think adding process_pending_softirqs() in the keyhandler isn't general? But this is a common solution so far. I think (and I vaguely recall possibly having said so during earlier review) that dump functions the output of which depends on CPU count should get modeled after dump_registers(), and it might be worth abstracting this in keyhandler.c. Yes, but this sounds like a new feature or framework rework rather than a fix patch. In a way, sure. It's a more extensive fix, which would avoid someone else running into the same issue with another handler. This seems a big change and a lot of dump function needs to rework, right? In any case quite likely the other patch of yours (which the one here basically modifies) may then also want to be reverted. I think patch "timer: process softirq during dumping timer" does right thing. The issue is triggered by previous patch. Well - the issue did not exist prior to both of your patches going in, and I think it would have continued to exist if the keyhandler rework patch alone had been reverted. (And I'm afraid anyway that "previous" is ambiguous here, as the timer handler change went in first.) Jan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Xen virtual IOMMU high level design doc V2
On 10/26/2016 5:36 PM, Jan Beulich wrote: On 18.10.16 at 16:14, wrote: 1.1 Enable more than 255 vcpu support HPC cloud service requires VM provides high performance parallel computing and we hope to create a huge VM with >255 vcpu on one machine to meet such requirement. Pin each vcpu on a separate pcpu. More than 255 vcpus support requires X2APIC and Linux disables X2APIC mode if there is no interrupt remapping function which is provided by vIOMMU. Interrupt remapping function helps to deliver interrupt to #vcpu >255. So we need to add vIOMMU before enabling >255 vcpus. I continue to dislike this completely neglecting that we can't even have >128 vCPU-s at present. Once again - there's other work to be done prior to lack of vIOMMU becoming the limiting factor. Yes, we can increase vcpu from 128 to 255 first without vIOMMU support. We have some draft patches to enable this. Andrew also will rework CPUID policy and change the rule of allocating vcpu's APIC ID. So we will base on it to increase vcpu number. VLAPIC also needs to be changed to support >255 APIC ID. These jobs can be implemented in parallel with vIOMMU. ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Xen virtual IOMMU high level design doc V2
On 10/26/2016 5:39 PM, Jan Beulich wrote: On 22.10.16 at 09:32, wrote: On 10/21/2016 4:36 AM, Andrew Cooper wrote: 3.5 Implementation consideration VT-d spec doesn't define a capability bit for the l2 translation. Architecturally there is no way to tell guest that l2 translation capability is not available. Linux Intel IOMMU driver thinks l2 translation is always available when VTD exits and fail to be loaded without l2 translation support even if interrupt remapping and l1 translation are available. So it needs to enable l2 translation first before other functions. What then is the purpose of the nested translation support bit in the extended capability register? It's to translate output GPA from first level translation(IOVA->GPA) to HPA. Detail please see VTD spec - 3.8 Nested Translation "When Nesting Enable (NESTE) field is 1 in extended-context-entries, requests-with-PASID translated through first-level translation are also subjected to nested second-level translation. Such extendedcontext- entries contain both the pointer to the PASID-table (which contains the pointer to the firstlevel translation structures), and the pointer to the second-level translation structures." I didn't phrase my question very well. I understand what the nested translation bit means, but I don't understand why we have a problem signalling the presence or lack of nested translations to the guest. In other words, why can't we hide l2 translation from the guest by simply clearing the nested translation capability? You mean to tell no support of l2 translation via nest translation bit? But the nested translation is a different function with l2 translation even from guest view and nested translation only works requests with PASID (l1 translation). Linux intel iommu driver enables l2 translation unconditionally and free iommu instance when failed to enable l2 translation. 
In which cases the wording of your description is confusing: Instead of "Linux Intel IOMMU driver thinks l2 translation is always available when VTD exits and fail to be loaded without l2 translation support ..." how about using something closer to what you've replied with last? Sorry for my poor English. Will update. ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Xen virtual IOMMU high level design doc V2
On 2016年10月21日 04:36, Andrew Cooper wrote: >> >>> u64 iova; >>> /* Out parameters. */ >>> u64 translated_addr; >>> u64 addr_mask; /* Translation page size */ >>> IOMMUAccessFlags permisson; >> >> How is this translation intended to be used? How do you plan to avoid >> race conditions where qemu requests a translation, receives one, the >> guest invalidated the mapping, and then qemu tries to use its translated >> address? >> >> There are only two ways I can see of doing this race-free. One is to >> implement a "memcpy with translation" hypercall, and the other is to >> require the use of ATS in the vIOMMU, where the guest OS is required to >> wait for a positive response from the vIOMMU before it can safely reuse >> the mapping. >> >> The former behaves like real hardware in that an intermediate entity >> performs the translation without interacting with the DMA source. The >> latter explicitly exposing the fact that caching is going on at the >> endpoint to the OS. > > The former one seems to move DMA operation into hypervisor but Qemu > vIOMMU framework just passes IOVA to dummy xen-vIOMMU without input > data and access length. I will dig more to figure out solution. Yes - that does in principle actually move the DMA out of Qemu. Hi Adnrew: The first solution "Move the DMA out of Qemu": Qemu vIOMMU framework just give a chance of doing DMA translation to dummy xen-vIOMMU device model and DMA access operation is in the vIOMMU core code. It's hard to move this out. There are a lot of places to call translation callback and some these are not for DMA access(E,G Map guest memory in Qemu). The second solution "Use ATS to sync invalidation operation.": This requires to enable ATS for all virtual PCI devices. This is not easy to do. The following is my proposal: When IOMMU driver invalidates IOTLB, it also will wait until the invalidation completion. We may use this to drain in-fly DMA operation. 
Guest triggers invalidation operation and trips into vIOMMU in hypervisor to flush cache data. After this, it should go to Qemu to drain in-fly DMA translation. To do that, dummy vIOMMU in Qemu registers the same MMIO region as vIOMMU's and emulation part of invalidation operation returns X86EMUL_UNHANDLEABLE after flush cache. MMIO emulation part is supposed to send event to Qemu and dummy vIOMMU gets a chance to start a thread to drain in-fly DMA and return emulation done. Guest polls IVT(invalidate IOTLB) bit in the IOTLB invalidate register until it's cleared. Dummy vIOMMU notifies vIOMMU drain operation completed via hypercall, vIOMMU clears IVT bit and guest finishes invalidation operation. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Xen virtual IOMMU high level design doc V2
On 10/26/2016 5:39 PM, Jan Beulich wrote: On 22.10.16 at 09:32, wrote: On 10/21/2016 4:36 AM, Andrew Cooper wrote: 3.5 Implementation consideration VT-d spec doesn't define a capability bit for the l2 translation. Architecturally there is no way to tell guest that l2 translation capability is not available. Linux Intel IOMMU driver thinks l2 translation is always available when VTD exits and fail to be loaded without l2 translation support even if interrupt remapping and l1 translation are available. So it needs to enable l2 translation first before other functions. What then is the purpose of the nested translation support bit in the extended capability register? It's to translate output GPA from first level translation(IOVA->GPA) to HPA. Detail please see VTD spec - 3.8 Nested Translation "When Nesting Enable (NESTE) field is 1 in extended-context-entries, requests-with-PASID translated through first-level translation are also subjected to nested second-level translation. Such extendedcontext- entries contain both the pointer to the PASID-table (which contains the pointer to the firstlevel translation structures), and the pointer to the second-level translation structures." I didn't phrase my question very well. I understand what the nested translation bit means, but I don't understand why we have a problem signalling the presence or lack of nested translations to the guest. In other words, why can't we hide l2 translation from the guest by simply clearing the nested translation capability? You mean to tell no support of l2 translation via nest translation bit? But the nested translation is a different function with l2 translation even from guest view and nested translation only works requests with PASID (l1 translation). Linux intel iommu driver enables l2 translation unconditionally and free iommu instance when failed to enable l2 translation. 
In which cases the wording of your description is confusing: Instead of "Linux Intel IOMMU driver thinks l2 translation is always available when VTD exits and fail to be loaded without l2 translation support ..." how about using something closer to what you've replied with last? Jan Hi All: I have some updates about implementation dependency between l2 translation(DMA translation) and irq remapping. I find there are a kernel parameter "intel_iommu=on" and kconfig option CONFIG_INTEL_IOMMU_DEFAULT_ON which control DMA translation function. When they aren't set, DMA translation function will not be enabled by IOMMU driver even if some vIOMMU registers show L2 translation function available. In the meantime, irq remapping function still can work to support >255 vcpus. I check distribution RHEL, SLES, Oracle and ubuntu don't set the kernel parameter or select the kconfig option. So we can emulate irq remapping fist with some capability bits(e,g SAGAW of Capability Register) of l2 translation for >255 vcpus support without l2 translation emulation. Showing l2 capability bits is to make sure IOMMU driver probe ACPI DMAR tables successfully because IOMMU driver access these bits during reading ACPI tables. If someone add "intel_iommu=on" kernel parameter manually, IOMMU driver will panic guest because it can't enable DMA remapping function via gcmd register and "Translation Enable Status" bit in gsts register is never set by vIOMMU. This shows actual vIOMMU status of no l2 translation emulation and warn user should not enable l2 translation. ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] Xen virtual IOMMU high level design doc V3
Change since V2: 1) Update motivation for Xen vIOMMU - 288 vcpus support part Add descriptor about plan of increasing vcpu from 128 to 255 and dependency between X2APIC and interrupt remapping. 2) Update 3.1 New vIOMMU hypercall interface Change vIOMMU hypercall from sysctl to dmop, add multi vIOMMU consideration consideration and drain in-fly DMA subcommand 3) Update 3.5 implementation consideration We found it's still safe to enable interrupt remapping function before adding l2 translation(DMA translation) to increase vcpu number >255. 4) Update 3.2 l2 translation - virtual device part Add proposal to deal with race between in-fly DMA and invalidation operation in hypervisor. 5) Update 4.4 Report vIOMMU to hvmloader Add option of building ACPI DMAR table in the toolstack for discussion. Change since V1: 1) Update motivation for Xen vIOMMU - 288 vcpus support part 2) Change definition of struct xen_sysctl_viommu_op 3) Update "3.5 Implementation consideration" to explain why we needs to enable l2 translation first. 4) Update "4.3 Q35 vs I440x" - Linux/Windows VTD drivers can work on the emulated I440 chipset. 5) Remove stale statement in the "3.3 Interrupt remapping" Content: === 1. Motivation of vIOMMU 1.1 Enable more than 255 vcpus 1.2 Support VFIO-based user space driver 1.3 Support guest Shared Virtual Memory (SVM) 2. Xen vIOMMU Architecture 2.1 l2 translation overview 2.2 Interrupt remapping overview 3. Xen hypervisor 3.1 New vIOMMU hypercall interface 3.2 l2 translation 3.3 Interrupt remapping 3.4 l1 translation 3.5 Implementation consideration 4. Qemu 4.1 Qemu vIOMMU framework 4.2 Dummy xen-vIOMMU driver 4.3 Q35 vs. i440x 4.4 Report vIOMMU to hvmloader Glossary: l1 translation - first-level translation to remap a virtual address to intermediate (guest) physical address. (GVA->GPA) l2 translation - second-level translations to remap a intermediate physical address to machine (host) physical address. 
(GPA->HPA) 1 Motivation for Xen vIOMMU 1.1 Enable more than 255 vcpu support HPC cloud service requires VM provides high performance parallel computing and we hope to create a huge VM with >255 vcpu on one machine to meet such requirement. Pin each vcpu to separate pcpus. Now HVM guest can support 128 vcpus at most. We can increase vcpu number from 128 to 255 via changing some limitations and extending vcpu related data structure. This also needs to change the rule of allocating vcpu's APIC ID. Current rule is "(APIC ID) = (vcpu index) * 2". We need to change it to "(APIC ID) = (vcpu index)". Andrew Cooper's CPUID improvement work will cover this to improve guest's cpu topology. We will base on this to increase vcpu number from 128 to 255. To support >255 vcpus, X2APIC mode in guest is necessary because legacy APIC(XAPIC) just supports 8-bit APIC ID and it only can support 255 vcpus at most. X2APIC mode supports 32-bit APIC ID and it requires interrupt remapping function of vIOMMU. The reason for this is that there is no modification to existing PCI MSI and IOAPIC with the introduction of X2APIC. PCI MSI/IOAPIC can only send interrupt message containing 8-bit APIC ID, which cannot address >255 cpus. Interrupt remapping supports 32-bit APIC ID and so it's necessary to enable >255 cpus with x2apic mode. Both Linux and Windows requires interrupt remapping when cpu number is >255. 1.2 Support VFIO-based user space driver (e.g. DPDK) in the guest It relies on the l2 translation capability (IOVA->GPA) on vIOMMU. pIOMMU l2 becomes a shadowing structure of vIOMMU to isolate DMA requests initiated by user space driver. 1.3 Support guest SVM (Shared Virtual Memory) It relies on the l1 translation table capability (GVA->GPA) on vIOMMU. pIOMMU needs to enable both l1 and l2 translation in nested mode (GVA->GPA->HPA) for passthrough device. IGD passthrough is the main usage today (to support OpenCL 2.0 SVM feature). In the future SVM might be used by other I/O devices too. 2.
Xen vIOMMU Architecture * vIOMMU will be inside Xen hypervisor for following factors 1) Avoid round trips between Qemu and Xen hypervisor 2) Ease of integration with the rest of the hypervisor 3) HVMlite/PVH doesn't use Qemu * Dummy xen-vIOMMU in Qemu as a wrapper of new hypercall to create /destroy vIOMMU in hypervisor and deal with virtual PCI device's l2 translation. 2.1 l2 translation overview For Virtual PCI device, dummy xen-vIOMMU does translation in the Qemu via new
Re: [Xen-devel] Xen virtual IOMMU high level design doc V3
On 11/19/2016 3:43 AM, Julien Grall wrote: Hi Lan, On 17/11/2016 09:36, Lan Tianyu wrote: 1) Definition of "struct xen_dmop_viommu_op" as new hypercall parameter. struct xen_dmop_viommu_op { u32 cmd; u32 domid; u32 viommu_id; union { struct { u32 capabilities; } query_capabilities; struct { /* IN parameters. */ u32 capabilities; u64 base_address; struct { u32 size; XEN_GUEST_HANDLE_64(uint32) dev_list; } dev_scope; /* Out parameters. */ u32 viommu_id; } create_iommu; struct { /* IN parameters. */ u32 vsbdf; I only gave a quick look through this design document. The new hypercalls looks arch/device agnostic except this part. Having a virtual IOMMU on Xen ARM is something we might consider in the future. In the case of ARM, a device can either be a PCI device or integrated device. The latter does not have a sbdf. The IOMMU will usually be configured with a stream ID (SID) that can be deduced from the sbdf and hardcoded for integrated device. So I would rather not tie the interface to PCI and use a more generic name for this field. Maybe vdevid, which then can be architecture specific. Hi Julien: Thanks for your input. This interface is just for virtual PCI device which is called by Qemu. I am not familiar with ARM. Are there any non-PCI emulated devices for arm in Qemu which need to be covered by vIOMMU? ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Xen virtual IOMMU high level design doc V3
On 2016年11月21日 21:41, Andrew Cooper wrote: > On 17/11/16 15:36, Lan Tianyu wrote: >> 3.2 l2 translation >> 1) For virtual PCI device >> Xen dummy xen-vIOMMU in Qemu translates IOVA to target GPA via new >> hypercall when DMA operation happens. >> >> When guest triggers a invalidation operation, there maybe in-fly DMA >> request for virtual device has been translated by vIOMMU and return back >> Qemu. Before vIOMMU tells invalidation completed, it's necessary to make >> sure in-fly DMA operation is completed. >> >> When IOMMU driver invalidates IOTLB, it also will wait until the >> invalidation completion. We may use this to drain in-fly DMA operation >> for virtual device. >> >> Guest triggers invalidation operation and trip into vIOMMU in >> hypervisor to flush cache data. After this, it should go to Qemu to >> drain in-fly DMA translation. >> >> To do that, dummy vIOMMU in Qemu registers the same MMIO region as >> vIOMMU's and emulation part of invalidation operation in Xen hypervisor >> returns X86EMUL_UNHANDLEABLE after flush cache. MMIO emulation part is >> supposed to send event to Qemu and dummy vIOMMU get a chance to starts a >> thread to drain in-fly DMA and return emulation done. >> >> Guest polls IVT(invalidate IOTLB) bit in the IOTLB invalidate register >> until it's cleared after triggering invalidation. Dummy vIOMMU in Qemu >> notifies hypervisor drain operation completed via hypercall, vIOMMU >> clears IVT bit and guest finish invalidation operation. > > Having the guest poll will be very inefficient. If the invalidation > does need to reach qemu, it will be a very long time until it > completes. Is there no interrupt based mechanism which can be used? > That way the guest can either handle it asynchronous itself, or block > waiting on an interrupt, both of which are better than having it just > spinning. > Hi Andrew: VTD provides interrupt event for Queue invalidation completion. 
So guest can select poll or interrupt mode to wait for invalidation completion. I found Linux Intel IOMMU driver just used poll mode and so used it for example. Regardless of poll and interrupt mode, guest will wait for invalidation completion and we just need to make sure to finish draining in-fly DMA before clearing invalidation completion bit. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Xen virtual IOMMU high level design doc V3
On 2016年11月21日 15:05, Tian, Kevin wrote: >> If someone add "intel_iommu=on" kernel parameter manually, IOMMU driver >> > will panic guest because it can't enable DMA remapping function via gcmd >> > register and "Translation Enable Status" bit in gsts register is never >> > set by vIOMMU. This shows actual vIOMMU status that there is no l2 >> > translation support and warn user should not enable l2 translation. > The rationale of section 3.5 is confusing. Do you mean sth. like below? > > - We can first do IRQ remapping, because DMA remapping (l1/l2) and > IRQ remapping can be enabled separately according to VT-d spec. Enabling > of DMA remapping will be first emulated as a failure, which may lead > to guest kernel panic if intel_iommu is turned on in the guest. But it's > not a big problem because major distributions have DMA remapping > disabled by default while IRQ remapping is enabled. > > - For DMA remapping, likely you'll enable L2 translation first (there is > no capability bit) with L1 translation disabled (there is a SVM capability > bit). > > If yes, maybe we can break this design into 3 parts too, so both > design review and implementation side can move forward step by > step? > Yes, we may implement IRQ remapping first. I will break this design into 3 parts(interrupt remapping, L2 translation and L1 translation). IRQ remapping will be first one to be sent out for detail discussion. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Xen virtual IOMMU high level design doc V3
On 2016年11月22日 18:24, Jan Beulich wrote: On 17.11.16 at 16:36, wrote: >> 2) Build ACPI DMAR table in toolstack >> Now tool stack can build ACPI DMAR table according to VM configuration and pass >> through it to hvmloader via xenstore ACPI PT channel. But the vIOMMU MMIO >> region is managed by Qemu and it needs to be populated into DMAR >> table. We may hardcode an address in both Qemu and toolstack and use the >> same address to create vIOMMU and build DMAR table. > Let's try to avoid any new hard coding of values. Both tool stack > and qemu ought to be able to retrieve a suitable address range > from the hypervisor. Or if the tool stack was to allocate it, it could > tell qemu. > > Jan > Hi Jan: The address range is allocated by Qemu or toolstack and passed to the hypervisor when creating vIOMMU. The vIOMMU's address range should be under PCI address space and so we need to reserve a piece of PCI region for vIOMMU in the toolstack. Then, populate base address in the vDMAR table and tell Qemu the region via new xenstore interface if we want to create vIOMMU in the Qemu dummy hypercall wrapper. Another point, I am not sure whether we can create/destroy vIOMMU directly in toolstack because virtual device models usually are handled by Qemu. If yes, we don't need new Xenstore interface. In this case, the dummy vIOMMU in Qemu will just cover L2 translation for virtual device. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Xen virtual IOMMU high level design doc
On 2016年11月24日 12:09, Edgar E. Iglesias wrote: Hi, > > > > > > I have a few questions. > > > > > > If I understand correctly, you'll be emulating an Intel IOMMU in Xen. > > > So guests will essentially create intel iommu style page-tables. > > > > > > If we were to use this on Xen/ARM, we would likely be modelling an > > > ARM > > > SMMU as a vIOMMU. Since Xen on ARM does not use QEMU for emulation, > > > the > > > hypervisor OPs for QEMUs xen dummy IOMMU queries would not really be > > > used. > > > Do I understand this correctly? >>> > > >>> > > I think they could be called from the toolstack. This is why I was >>> > > saying in the other thread that the hypercalls should be general enough >>> > > that QEMU is not the only caller. >>> > > >>> > > For PVH and ARM guests, the toolstack should be able to setup the vIOMMU >>> > > on behalf of the guest without QEMU intervention. > OK, I see. Or, I think I understand, not sure :-) > > In QEMU when someone changes mappings in an IOMMU there will be a notifier > to tell caches upstream that mappings have changed. I think we will need to > prepare for that. I.e when TCG CPUs sit behind an IOMMU. For Xen side, we may notify pIOMMU driver about mapping change via calling pIOMMU driver's API in vIOMMU. > > Another area that may need change is that on ARM we need the map-query to > return > the memory attributes for the given mapping. Today QEMU or any emulator > doesn't use it much but in the future things may change. > > For SVM, whe will also need to deal with page-table faults by the IOMMU. > So I think there will need to be a channel from Xen to Guesrt to report these. Yes, vIOMMU should forward the page-fault event to guest. For VTD side, we will trigger VTD's interrupt to notify guest about the event. > > For example, what happens when a guest assigned DMA unit page-faults? 
> Xen needs to know how to forward this fault back to guest for fixup and the > guest needs to be able to fix it and tell the device that it's OK to continue. > E.g. PCI PRI or similar. > > -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Xen virtual IOMMU high level design doc
On 11/24/2016 9:37 PM, Edgar E. Iglesias wrote: On Thu, Nov 24, 2016 at 02:49:41PM +0800, Lan Tianyu wrote: On 2016年11月24日 12:09, Edgar E. Iglesias wrote: Hi, I have a few questions. If I understand correctly, you'll be emulating an Intel IOMMU in Xen. So guests will essentially create intel iommu style page-tables. If we were to use this on Xen/ARM, we would likely be modelling an ARM SMMU as a vIOMMU. Since Xen on ARM does not use QEMU for emulation, the hypervisor OPs for QEMUs xen dummy IOMMU queries would not really be used. Do I understand this correctly? I think they could be called from the toolstack. This is why I was saying in the other thread that the hypercalls should be general enough that QEMU is not the only caller. For PVH and ARM guests, the toolstack should be able to setup the vIOMMU on behalf of the guest without QEMU intervention. OK, I see. Or, I think I understand, not sure :-) In QEMU when someone changes mappings in an IOMMU there will be a notifier to tell caches upstream that mappings have changed. I think we will need to prepare for that. I.e when TCG CPUs sit behind an IOMMU. For Xen side, we may notify pIOMMU driver about mapping change via calling pIOMMU driver's API in vIOMMU. I was refering to the other way around. When a guest modifies the mappings for a vIOMMU, the driver domain with QEMU and vDevices needs to be notified. I couldn't find any mention of this in the document... Qemu side won't have iotlb cache and all DMA translation info are in the hypervisor. All vDevice's DMA requests are passed to hypervisor, hypervisor returns back translated address and then Qemu finish the DMA operation finally. There is a race condition between iotlb invalidation operation and vDevices' in-fly DMA. We proposed a solution in "3.2 l2 translation - For virtual PCI device". We hope to take advantage of current ioreq mechanism to achieve something like notifier. Both vIOMMU in hypervisor and dummy vIOMMU in Qemu register the same MMIO region. 
When there is a invalidation MMIO access and hypervisor want to notify Qemu, vIOMMU's MMIO handler returns X86EMUL_UNHANDLEABLE and io emulation handler is supposed to send IO request to Qemu. Dummy vIOMMU in Qemu receives the event and start to drain in-fly DMA operation. Another area that may need change is that on ARM we need the map-query to return the memory attributes for the given mapping. Today QEMU or any emulator doesn't use it much but in the future things may change. What about the mem attributes? It's very likely we'll add support for memory attributes for IOMMU's in QEMU at some point. Emulated IOMMU's will thus have the ability to modify attributes (i.e SourceID's, cacheability, etc). Perhaps we could allocate or reserve an uint64_t for attributes TBD later in the query struct. Sounds like you hope to extend capability variable in the query struct to uint64_t to support more future feature, right? I have added "permission" variable in struct l2_translation to return vIOMMU's memory access permission for vDevice's DMA request. No sure it can meet your requirement. For SVM, whe will also need to deal with page-table faults by the IOMMU. So I think there will need to be a channel from Xen to Guesrt to report these. Yes, vIOMMU should forward the page-fault event to guest. For VTD side, we will trigger VTD's interrupt to notify guest about the event. OK, Cool. Perhaps you should document how this (and the map/unmap notifiers) will work? This is VTD specific to deal with some fault events and just like some other virtual device models emulate its interrupt. So I didn't put this in this design document. For mapping change, please see the fist comments. ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Discussion about virtual iommu support for Xen guest
On 6/8/2016 4:11 PM, Tian, Kevin wrote: It makes sense... I thought you used this security issue against placing vIOMMU in Qemu, which made me a bit confused earlier. :-) We are still thinking feasibility of some staging plan, e.g. first implementing some vIOMMU features w/o dependency on root-complex in Xen (HVM only) and then later enabling full vIOMMU feature w/ root-complex in Xen (covering HVMLite). If we can reuse most code between two stages while shorten time-to-market by half (e.g. from 2yr to 1yr), it's still worthy of pursuing. will report back soon once the idea is consolidated... Thanks Kevin After discussion with Kevin, we draft a staging plan of implementing vIOMMU in Xen based on Qemu host bridge. Both virtual devices and passthrough devices use one vIOMMU in Xen. Your comments are very appreciated. 1. Enable Q35 support in the hvmloader. In the real world, VTD support starts from Q35 and OS may have such assumption that VTD only exists on the Q35 or newer platform. Q35 support seems necessary for vIOMMU support. Regardless of Q35 host bridge in the Qemu or Xen hypervisor, hvmloader needs to be compatible with Q35 and build Q35 ACPI tables. Qemu already has Q35 emulation and so the hvmloader job can start with Qemu. When host bridge in Xen is ready, these changes also can be reused. 2. Implement vIOMMU in Xen based on Qemu host bridge. Add a new device type "Xen iommu" in the Qemu as a wrapper of vIOMMU hypercalls to communicate with Xen vIOMMU. It's in charge of: 1) Query vIOMMU capability(E,G interrupt remapping, DMA translation, SVM and so on) 2) Create vIOMMU with predefined base address of IOMMU unit regs 3) Notify hvmloader to populate related content in the ACPI DMAR table.(Add vIOMMU info to struct hvm_info_table) 4) Deal with DMA translation request of virtual devices and return back translated address. 5) Attach/detach hotplug device from vIOMMU New hypercalls for vIOMMU that are also necessary when host bridge in Xen.
1) Query vIOMMU capability 2) Create vIOMMU(IOMMU unit reg base as params) 3) Virtual device's DMA translation 4) Attach/detach hotplug device from VIOMMU All IOMMU emulations will be done in Xen 1) DMA translation 2) Interrupt remapping 3) Shared Virtual Memory (SVM) ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Re: [Xen-devel] Discussion about virtual iommu support for Xen guest
Hi Stefano, Andrew and Jan: Could you give us more guides here to move forward virtual iommu development? Thanks. On 6/29/2016 11:04 AM, Tian, Kevin wrote: From: Lan, Tianyu Sent: Sunday, June 26, 2016 9:43 PM On 6/8/2016 4:11 PM, Tian, Kevin wrote: It makes sense... I thought you used this security issue against placing vIOMMU in Qemu, which made me a bit confused earlier. :-) We are still thinking feasibility of some staging plan, e.g. first implementing some vIOMMU features w/o dependency on root-complex in Xen (HVM only) and then later enabling full vIOMMU feature w/ root-complex in Xen (covering HVMLite). If we can reuse most code between two stages while shorten time-to-market by half (e.g. from 2yr to 1yr), it's still worthy of pursuing. will report back soon once the idea is consolidated... Thanks Kevin After discussion with Kevin, we draft a staging plan of implementing vIOMMU in Xen based on Qemu host bridge. Both virtual devices and passthough devices use one vIOMMU in Xen. Your comments are very appreciated. The rationale here is to separate BIOS structures from actual vIOMMU emulation. vIOMMU will be always emulated in Xen hypervisor, regardless of where Q35 emulation is done or whether it's HVM or HVMLite. The staging plan is more for the BIOS structure reporting which is Q35 specific. For now we first target Qemu Q35 emulation, with a set of vIOMMU ops introduced as Tianyu listed below to help interact between Qemu and Xen. Later when Xen Q35 emulation is ready, the reporting can be done in Xen. The main limitation of this model is on DMA emulation of Qemu virtual devices, which needs to query Xen vIOMMU for every virtual DMA. It is possibly fine for virtual devices which are normally not for performance critical usages. Also there may be some chance to cache some translations within Qemu like thru ATS (may not worthy of it though...). 1. Enable Q35 support in the hvmloader. 
In the real world, VTD support starts from Q35 and OS may have such assumption that VTD only exists on the Q35 or newer platform. Q35 support seems necessary for vIOMMU support. In regardless of Q35 host bridge in the Qemu or Xen hypervisor, hvmloader needs to be compatible with Q35 and build Q35 ACPI tables. Qemu already has Q35 emulation and so the hvmloader job can start with Qemu. When host bridge in Xen is ready, these changes also can be reused. 2. Implement vIOMMU in Xen based on Qemu host bridge. Add a new device type "Xen iommu" in the Qemu as a wrapper of vIOMMU hypercalls to communicate with Xen vIOMMU. It's in charge of: 1) Query vIOMMU capability(E,G interrupt remapping, DMA translation, SVM and so on) 2) Create vIOMMU with predefined base address of IOMMU unit regs 3) Notify hvmloader to populate related content in the ACPI DMAR table.(Add vIOMMU info to struct hvm_info_table) 4) Deal with DMA translation request of virtual devices and return back translated address. 5) Attach/detach hotplug device from vIOMMU New hypercalls for vIOMMU that are also necessary when host bridge in Xen. 1) Query vIOMMU capability 2) Create vIOMMU(IOMMU unit reg base as params) 3) Virtual device's DMA translation 4) Attach/detach hotplug device from VIOMMU We don't need 4). Hotplug device is automatically handled by the vIOMMU with INCLUDE_ALL flag set (which should be the case if we only have one vIOMMU in Xen). We don't need further notify this event to Xen vIOMMU. And once we have Xen Q35 emulation in place, possibly only 3) is required then. All IOMMU emulations will be done in Xen 1) DMA translation 2) Interrupt remapping 3) Shared Virtual Memory (SVM) Please let us know your thoughts. If no one has explicit objection based on above rough idea, we'll go to write the high level design doc for more detail discussion. Thanks Kevin ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] Discussion about virtual iommu support for Xen guest
On 7/5/2016 9:57 PM, Jan Beulich wrote: On 05.07.16 at 15:37, wrote: Hi Stefano, Andrew and Jan: Could you give us more guides here to move forward virtual iommu development? Thanks. Due to ... On 6/29/2016 11:04 AM, Tian, Kevin wrote: Please let us know your thoughts. If no one has explicit objection based on above rough idea, we'll go to write the high level design doc for more detail discussion. ... this I actually expected we'd get to see something, rather than our input being waited for. OK. I get it. Because no response, double confirm we are on the right way. ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [RFC PATCH 0/4] Qemu: Add Xen vIOMMU support
On 2017年03月20日 19:38, Paolo Bonzini wrote: > Fair enough, though I'd be worried about increasing the attack surface > of the hypervisor. For KVM, for example, IOMMU emulation requires using > the "split irqchip" feature to move the PIC and IOAPIC out of the kernel > and back to QEMU. Yes, just like Roger mentioned we also need to support no-qemu mode on Xen and this is tradeoff result. > > Also, I think this series is missing changes to support IOMMU > translation in the vIOMMU device model. Yes, this series just enabled vIOMMU's irq remapping function and we need to pass virtual device's DMA request to Xen hypervisor for translation when enable DMA translation. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [RFC PATCH 00/23] xen/vIOMMU: Add vIOMMU support with irq remapping function on Intel platform
On 2017年03月20日 22:23, Roger Pau Monné wrote: > Thanks! So you add all this vIOMMU code, but the maximum number of allowed > vCPUs for HVM guests is still limited to 128 (HVM_MAX_VCPUS is not touched). > Is > there any missing pieces in order to bump this? To increase vcpu number, we need to change APIC ID rule and now it's APICID = VCPUID * 2. Andrew's CPUID improvement will change it and so our following patches of increasing vcpu number will base on Andrew's job. > > Also, have you tested if this series works with PVH guests? Boris added PVH > support to Linux not long ago, so you should be able to test it just by > picking > the latest Linux kernel. Our patchset just targets hvm guest and it will not work for PV guest. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [RFC PATCH 00/23] xen/vIOMMU: Add vIOMMU support with irq remapping function on Intel platform
On 2017年03月21日 10:28, Lan Tianyu wrote: > On 2017年03月20日 22:23, Roger Pau Monné wrote: >> Thanks! So you add all this vIOMMU code, but the maximum number of allowed >> vCPUs for HVM guests is still limited to 128 (HVM_MAX_VCPUS is not touched). >> Is >> there any missing pieces in order to bump this? > > To increase vcpu number, we need to change APIC ID rule and now it's > APICID = VCPUID * 2. Andrew's CPUID improvement will change it and so > our following patches of increasing vcpu number will base on Andrew's job. > > >> >> Also, have you tested if this series works with PVH guests? Boris added PVH >> support to Linux not long ago, so you should be able to test it just by >> picking >> the latest Linux kernel. > > Our patchset just targets hvm guest and it will not work for PV guest. New hypercalls introduced by this patchset also can reuse for PVH to enable vIOMMU. This patchset relies on Qemu Xen-vIOMMU device model to create/destroy vIOMMU. If we want to enable DMA translation for hvm guest later, virtual device's DMA request would be passed from Qemu to Xen hypervisor and the device model in Qemu is necessary. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [RFC PATCH 1/23] VIOMMU: Add vIOMMU helper functions to create, destroy and query capabilities
Hi Julien: Thanks for review. On 2017年03月22日 03:56, Julien Grall wrote: > === > > diff --git a/xen/include/public/viommu.h b/xen/include/public/viommu.h > new file mode 100644 > index 000..ca2419b > > --- /dev/null > > +++ b/xen/include/public/viommu.h > > @@ -0,0 +1,9 @@ > > +/* > +·*·include/public/viommu.h > +·* > +·*·Copyright·(c)·2017·Intel·Corporation > +·*·Author:·Lan·Tianyu· > +·* > +·*·This·program·is·free·software;·you·can·redistribute·it·and/or·modify·it > +·*·under·the·terms·and·conditions·of·the·GNU·General·Public·License, > +·*·version·2,·as·published·by·the·Free·Software·Foundation. > > obj-y += vmap.o > obj-y += vsprintf.o > obj-y += wait.o > +obj-y += viommu.o > I see very little point to enable viommu by default on all architecture. > This is x86 specific and I am yet sure how we would be able to use it on > ARM as the current series rely on QEMU. Also this is waste space in > struct domain. XEN_DMOP_create/destroy_viommu hypercalls we introduced are generic for all platforms and can use in toolstack to create/destroy vIOMMU rather than just in Qemu. This takes PVH case into account which also don't use Qemu. > I would prefer if you introduce a Kconfig that would be select by x86 only. > Regards, > Public headers sould not be GPLv2 otherwise it will cause some trouble > for non-GPLv2 OS. See the license in xen/include/public/COPYING. Yes, it should be MIT license. > > Regards. > > -- Julien Grall -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [RFC PATCH 1/23] VIOMMU: Add vIOMMU helper functions to create, destroy and query capabilities
On 3/22/2017 4:36 PM, Tian, Kevin wrote: From: Julien Grall [mailto:julien.gr...@arm.com] Sent: Wednesday, March 22, 2017 3:57 AM diff --git a/xen/common/Makefile b/xen/common/Makefile index 0fed30b..b58de63 100644 --- a/xen/common/Makefile +++ b/xen/common/Makefile @@ -60,6 +60,7 @@ obj-y += vm_event.o obj-y += vmap.o obj-y += vsprintf.o obj-y += wait.o +obj-y += viommu.o I see very little point to enable viommu by default on all architecture. This is x86 specific and I am yet sure how we would be able to use it on ARM as the current series rely on QEMU. Also this is waste space in struct domain. I would prefer if you introduce a Kconfig that would be select by x86 only. Regards, Also viommu.c is too generic. Each vendor should has his own implementation. better change to vvtd.c (and make more sense move to hvm) Hi Kevin: vIommu is an abstract layer and we have added vvtd.c under hvm directory in the following patch. vvtd will register its callbacks to vIOMMU layer. This works just like IOMMU core and VTD driver. Thanks Kevin ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [RFC PATCH 1/23] VIOMMU: Add vIOMMU helper functions to create, destroy and query capabilities
On 3/22/2017 7:40 PM, Julien Grall wrote: Hello, On 22/03/17 08:45, Lan Tianyu wrote: Hi Julien: Thanks for review. On 2017年03月22日 03:56, Julien Grall wrote: === diff --git a/xen/include/public/viommu.h b/xen/include/public/viommu.h new file mode 100644 index 000..ca2419b --- /dev/null +++ b/xen/include/public/viommu.h @@ -0,0 +1,9 @@ +/* +·*·include/public/viommu.h +·* +·*·Copyright·(c)·2017·Intel·Corporation +·*·Author:·Lan·Tianyu· +·* +·*·This·program·is·free·software;·you·can·redistribute·it·and/or·modify·it +·*·under·the·terms·and·conditions·of·the·GNU·General·Public·License, +·*·version·2,·as·published·by·the·Free·Software·Foundation. obj-y += vmap.o obj-y += vsprintf.o obj-y += wait.o +obj-y += viommu.o I see very little point to enable viommu by default on all architecture. This is x86 specific and I am yet sure how we would be able to use it on ARM as the current series rely on QEMU. Also this is waste space in struct domain. XEN_DMOP_create/destroy_viommu hypercalls we introduced are generic for all platforms and can use in toolstack to create/destroy vIOMMU rather than just in Qemu. This takes PVH case into account which also don't use Qemu. I am afraid that none of the DMOP you suggested in this series will fit for ARM. For instance it is not possible to select via DMOP_CREATE the kind of vIOMMU (e.g SMMUv2, SMMUv3, IPMMU-VMSA...). Thanks for your information. I am not sure whether we can introduce arch specific hypercalls for different vIOMMU implementations and So try to make it more general. To support more type vIOMMUs or more vIOMMU subfeature, we may extend input parameter structure. To be clear, I am not asking to get this code ready for ARM, but at least we need to make sure the API could be easily extended. During the discussion on the design documented it was suggested to add a iommu_version field to make it "future proof". Sure. That's very good suggestion. Sorry, I missed that in this series. 
and thought "capability" field in struct xen_dm_op_create_viommu is enough for other vendors to extend more sub features. Will change it. Also, I was not asking to move this code in arch/x86 but not compiling the code on ARM by default as it is currently unusable. Sure. Will change it. Regards, ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [RFC PATCH 5/23] Tools/libxc: Add viommu operations in libxc
Hi Paul: Sorry for later response. On 3/31/2017 3:57 AM, Chao Gao wrote: On Wed, Mar 29, 2017 at 09:08:06AM +, Paul Durrant wrote: -Original Message- From: Xen-devel [mailto:xen-devel-boun...@lists.xen.org] On Behalf Of Chao Gao Sent: 29 March 2017 01:40 To: Wei Liu Cc: Lan Tianyu ; Kevin Tian ; Ian Jackson ; xen-devel@lists.xen.org Subject: Re: [Xen-devel] [RFC PATCH 5/23] Tools/libxc: Add viommu operations in libxc Tianyu is on vacation this two weeks, so I will try to address some comments on this series. On Tue, Mar 28, 2017 at 05:24:03PM +0100, Wei Liu wrote: On Fri, Mar 17, 2017 at 07:27:05PM +0800, Lan Tianyu wrote: From: Chao Gao In previous patch, we introduce a common vIOMMU layer. In our design, we create/destroy vIOMMU through DMOP interface instead of creating it according to a config flag of domain. It makes it is possible to create vIOMMU in device model or in tool stack. I've not been following this closely so apologies if this has already been asked... Why would you need to create a vIOMMU instance in an external device model. Since the toolstack should be in control of the device model configuration why would it not know in advance that one was required? I assume your question is why we don't create a vIOMMU instance via hypercall in toolstack. I think creating in toolstack is also ok and is easier to be reused by pvh. If Tianyu has no concern about this, will move this part to toolstack. We can move create/destroy vIOMMU in the tool stack but we still need to add such dummy vIOMMU device model in Qemu to pass virtual device's DMA request into Xen hypervisor. Qemu is required to use DMOP hypercall and tool stack may use domctl hyercall. vIOMMU hypercalls will be divided into two part. Domctl: create, destroy and query. DMOP: vDev's DMA related operations. Is this OK? Thanks, Chao Paul The following toolstack code is to add XEN_DMOP_viommu_XXX syscalls: Hypercalls, not syscalls. 
- query capabilities of vIOMMU emulated by Xen - create vIOMMU in Xen hypervisor with base address, capability - destroy vIOMMU specified by viommu_id Signed-off-by: Chao Gao Signed-off-by: Lan Tianyu --- tools/libs/devicemodel/core.c | 69 + tools/libs/devicemodel/include/xendevicemodel.h | 35 + tools/libs/devicemodel/libxendevicemodel.map| 3 ++ tools/libxc/include/xenctrl_compat.h| 5 ++ tools/libxc/xc_devicemodel_compat.c | 18 +++ 5 files changed, 130 insertions(+) diff --git a/tools/libs/devicemodel/core.c b/tools/libs/devicemodel/core.c index a85cb49..aee1150 100644 --- a/tools/libs/devicemodel/core.c +++ b/tools/libs/devicemodel/core.c Bear in mind that this library is stable, so whatever ends up here can change in the future. This is not saying the following code is problematic. It is just a general FYI. Obviously the toolstack side is going to follow the hypervisor interface, so I will do a detailed review later. Sure. If the hypervisor interface settles down, we can inform you. +int xendevicemodel_viommu_destroy( +xendevicemodel_handle *dmod, domid_t dom, uint32_t viommu_id); #endif /* __XEN_TOOLS__ */ #endif /* XENDEVICEMODEL_H */ diff --git a/tools/libs/devicemodel/libxendevicemodel.map b/tools/libs/devicemodel/libxendevicemodel.map index 45c773e..c2e0968 100644 --- a/tools/libs/devicemodel/libxendevicemodel.map +++ b/tools/libs/devicemodel/libxendevicemodel.map @@ -17,6 +17,9 @@ VERS_1.0 { xendevicemodel_modified_memory; xendevicemodel_set_mem_type; xendevicemodel_inject_event; + xendevicemodel_viommu_query_cap; + xendevicemodel_viommu_create; + xendevicemodel_viommu_destroy; xendevicemodel_restrict; xendevicemodel_close; I suppose this series is going to miss 4.9. Please add these functions to VERS_1.1. Yes. We will fix this. 
local: *; /* Do not expose anything by default */ diff --git a/tools/libxc/include/xenctrl_compat.h b/tools/libxc/include/xenctrl_compat.h index 040e7b2..315c45d 100644 --- a/tools/libxc/include/xenctrl_compat.h +++ b/tools/libxc/include/xenctrl_compat.h @@ -164,6 +164,11 @@ int xc_hvm_set_mem_type( int xc_hvm_inject_trap( xc_interface *xch, domid_t domid, int vcpu, uint8_t vector, uint8_t type, uint32_t error_code, uint8_t insn_len, uint64_t cr2); +int xc_viommu_query_cap(xc_interface *xch, domid_t dom, uint64_t *cap); +int xc_viommu_create( +xc_interface *xch, domid_t dom, uint64_t base_addr, uint64_t cap, +uint32_t *viommu_id); +int xc_viommu_destroy(xc_interface *xch, domid_t dom, uint32_t viommu_id); #endif /* XC_WANT_COMPAT_DEVICEMODEL_API */ diff --git a/tools/libxc/xc_devicemodel_compat.c b/tools/libxc/xc_devicemodel_compat.c index e4edeea..62f703a 100644 --- a/tools/libxc/xc_devicemodel_
Re: [Xen-devel] [RFC PATCH 5/23] Tools/libxc: Add viommu operations in libxc
On 2017年04月17日 19:08, Wei Liu wrote: > On Fri, Apr 14, 2017 at 11:38:15PM +0800, Lan, Tianyu wrote: >> Hi Paul: >> Sorry for later response. >> >> On 3/31/2017 3:57 AM, Chao Gao wrote: >>> On Wed, Mar 29, 2017 at 09:08:06AM +, Paul Durrant wrote: >>>>> -Original Message- >>>>> From: Xen-devel [mailto:xen-devel-boun...@lists.xen.org] On Behalf Of >>>>> Chao Gao >>>>> Sent: 29 March 2017 01:40 >>>>> To: Wei Liu >>>>> Cc: Lan Tianyu ; Kevin Tian ; >>>>> Ian Jackson ; xen-devel@lists.xen.org >>>>> Subject: Re: [Xen-devel] [RFC PATCH 5/23] Tools/libxc: Add viommu >>>>> operations in libxc >>>>> >>>>> Tianyu is on vacation this two weeks, so I will try to address >>>>> some comments on this series. >>>>> >>>>> On Tue, Mar 28, 2017 at 05:24:03PM +0100, Wei Liu wrote: >>>>>> On Fri, Mar 17, 2017 at 07:27:05PM +0800, Lan Tianyu wrote: >>>>>>> From: Chao Gao >>>>>>> >>>>>>> In previous patch, we introduce a common vIOMMU layer. In our design, >>>>>>> we create/destroy vIOMMU through DMOP interface instead of creating >>>>> it >>>>>>> according to a config flag of domain. It makes it is possible >>>>>>> to create vIOMMU in device model or in tool stack. >>>>>>> >>>> >>>> I've not been following this closely so apologies if this has already been >>>> asked... >>>> >>>> Why would you need to create a vIOMMU instance in an external device model. >>>> Since the toolstack should be in control of the device model configuration >>>> why would it not know in advance that one was required? >>> >>> I assume your question is why we don't create a vIOMMU instance via >>> hypercall in toolstack. >>> I think creating in toolstack is also ok and is easier to be reused by pvh. >>> >>> If Tianyu has no concern about this, will move this part to toolstack. >> >> We can move create/destroy vIOMMU in the tool stack but we still need to add >> such dummy vIOMMU device model in Qemu to pass virtual device's DMA request >> into Xen hypervisor. 
Qemu is required to use DMOP hypercall and tool stack >> may use domctl hyercall. vIOMMU hypercalls will be divided into two part. >> >> Domctl: >> create, destroy and query. >> DMOP: >> vDev's DMA related operations. >> >> Is this OK? >> > > Why are they divided into two libraries? Can't they be in DMOP at the > same time? Yes, we can use DMOP for all vIOMMU hyercalls if it's necessary to keep unified vIOMMU hyercall type. In theory, DMOP dedicates to be used by Qemu but we also can use it in tool stack. If we move create, destroy and query operation to tool stack, it isn't necessary to use DMOP for them since only tool stack will call them. This is why I said we could use domctl for these operations. Both two ways will not affect function implementation. Which one it's better from your view? :) > > Just asking questions, not suggesting it should be done one way or the > other. Sorry if there are some obvious reasons that I missed. > > Wei. > -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [RFC PATCH 2/23] DMOP: Introduce new DMOP commands for vIOMMU support
Hi Konrad: Thanks for your review. On 2017年04月17日 22:36, Konrad Rzeszutek Wilk wrote: > On Fri, Mar 17, 2017 at 07:27:02PM +0800, Lan Tianyu wrote: >> This patch is to introduce create, destroy and query capabilities >> command for vIOMMU. vIOMMU layer will deal with requests and call >> arch vIOMMU ops. >> >> Signed-off-by: Lan Tianyu >> --- >> xen/arch/x86/hvm/dm.c | 29 + >> xen/include/public/hvm/dm_op.h | 39 +++ >> 2 files changed, 68 insertions(+) >> >> diff --git a/xen/arch/x86/hvm/dm.c b/xen/arch/x86/hvm/dm.c >> index 2122c45..2b28f70 100644 >> --- a/xen/arch/x86/hvm/dm.c >> +++ b/xen/arch/x86/hvm/dm.c >> @@ -491,6 +491,35 @@ static int dm_op(domid_t domid, >> break; >> } >> >> +case XEN_DMOP_create_viommu: >> +{ >> +struct xen_dm_op_create_viommu *data = >> +&op.u.create_viommu; >> + >> +rc = viommu_create(d, data->base_address, data->length, >> data->capabilities); >> +if (rc >= 0) { > > The style guide is is to have a space here and { on a newline. Yes, will fix. > >> +data->viommu_id = rc; >> +rc = 0; >> +} >> +break; >> +} > > Newline here.. > > >> +case XEN_DMOP_destroy_viommu: >> +{ >> +const struct xen_dm_op_destroy_viommu *data = >> +&op.u.destroy_viommu; >> + >> +rc = viommu_destroy(d, data->viommu_id); >> +break; >> +} > > Ahem? >> +case XEN_DMOP_query_viommu_caps: >> +{ >> +struct xen_dm_op_query_viommu_caps *data = >> +&op.u.query_viommu_caps; >> + >> +data->caps = viommu_query_caps(d); >> +rc = 0; >> +break; >> +} > > And here. >> default: >> rc = -EOPNOTSUPP; >> break; >> diff --git a/xen/include/public/hvm/dm_op.h b/xen/include/public/hvm/dm_op.h >> index f54cece..b8c7359 100644 >> --- a/xen/include/public/hvm/dm_op.h >> +++ b/xen/include/public/hvm/dm_op.h >> @@ -318,6 +318,42 @@ struct xen_dm_op_inject_msi { >> uint64_aligned_t addr; >> }; >> >> +/* >> + * XEN_DMOP_create_viommu: Create vIOMMU device. 
>> + */ >> +#define XEN_DMOP_create_viommu 15 >> + >> +struct xen_dm_op_create_viommu { >> +/* IN - MMIO base address of vIOMMU */ > > Any limit? Can it be zero? In current patchset, base address is allocated by toolstack and passed to Qemu to create vIOMMU in hyervisor. Toolstack should make sure the range won't be conflicted with other resource. > >> +uint64_t base_address; >> +/* IN - Length of MMIO region */ > > Any restrictions? Can it be say 2 bytes? Or is this in page-size granularity? From the VTD spec, register size must be an integer multiple of 4KB and I think the vIOMMU device model(E,G vvtd) in hypervisor should check the lengh. Different vendor may have different restriction. > >> +uint64_t length; >> +/* IN - Capabilities with which we want to create */ >> +uint64_t capabilities; > > That sounds like some form of flags? Yes, this patchset just introduces interrupt remapping flag and other vendor also can use it to add new features. > >> +/* OUT - vIOMMU identity */ >> +uint32_t viommu_id; >> +}; >> + >> +/* >> + * XEN_DMOP_destroy_viommu: Destroy vIOMMU device. >> + */ >> +#define XEN_DMOP_destroy_viommu 16 >> + >> +struct xen_dm_op_destroy_viommu { >> +/* OUT - vIOMMU identity */ > > Out? Not in? Sorry, it should be OUT parameter. > >> +uint32_t viommu_id; >> +}; >> + >> +/* >> + * XEN_DMOP_q_viommu: Query vIOMMU capabilities. >> + */ >> +#define XEN_DMOP_query_viommu_caps 17 >> + >> +struct xen_dm_op_query_viommu_caps { >> +/* OUT - vIOMMU Capabilities*/ > > Don't you need to also mention which vIOMMU? As you > could have potentially many of them? If we want to support different vendors' vIOMMU, it's necessary to do that and we need to introduce a new field "vIOMMU type" (E,G Intel, AMD and ARM IOMMU). 
> >> +uint64_t caps; >> +}; >> + >> struct xen_dm_op { >> uint32_t op; >> uint32_t pad; >> @@ -336,6 +372,9 @@ struct xen_dm_op { >> struct xen_dm_op_set_mem_type set_mem_type; >> struct xen_dm_op_inject_event inject_event; >> struct xen_dm_op_inject_msi inject_msi; >> +struct xen_dm_op_create_viommu create_viommu; >> +struct xen_dm_op_destroy_viommu destroy_viommu; >> +struct xen_dm_op_query_viommu_caps query_viommu_caps; >> } u; >> }; >> >> -- >> 1.8.3.1 >> >> >> ___ >> Xen-devel mailing list >> Xen-devel@lists.xen.org >> https://lists.xen.org/xen-devel -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [RFC PATCH 3/23] VIOMMU: Add irq request callback to deal with irq remapping
On 2017年04月17日 22:39, Konrad Rzeszutek Wilk wrote: > On Fri, Mar 17, 2017 at 07:27:03PM +0800, Lan Tianyu wrote: >> This patch is to add irq request callback for platform implementation >> to deal with irq remapping request. >> >> Signed-off-by: Lan Tianyu >> --- >> xen/common/viommu.c | 11 +++ >> xen/include/asm-arm/viommu.h | 4 >> xen/include/asm-x86/viommu.h | 15 +++ >> xen/include/xen/viommu.h | 8 >> 4 files changed, 38 insertions(+) >> >> diff --git a/xen/common/viommu.c b/xen/common/viommu.c >> index 4c1c788..62c66db 100644 >> --- a/xen/common/viommu.c >> +++ b/xen/common/viommu.c >> @@ -87,6 +87,17 @@ u64 viommu_query_caps(struct domain *d) >> return info->ops->query_caps(d); >> } >> >> +int viommu_handle_irq_request(struct domain *d, >> +struct irq_remapping_request *request) >> +{ >> +struct viommu_info *info = &d->viommu; >> + >> +if ( !info || !info->ops || !info->ops->handle_irq_request) > > You are missing an space at the end. Yes, will fix. >> +return -EINVAL; >> + >> +return info->ops->handle_irq_request(d, request); >> +} >> + >> /* >> * Local variables: >> * mode: C >> diff --git a/xen/include/asm-arm/viommu.h b/xen/include/asm-arm/viommu.h >> index ef6a60b..6a81ecb 100644 >> --- a/xen/include/asm-arm/viommu.h >> +++ b/xen/include/asm-arm/viommu.h >> @@ -22,6 +22,10 @@ >> >> #include >> >> +struct irq_remapping_request >> +{ >> +}; >> + >> static inline const struct viommu_ops *viommu_get_ops(void) >> { >> return NULL; >> diff --git a/xen/include/asm-x86/viommu.h b/xen/include/asm-x86/viommu.h >> index efb435f..b6e01a5 100644 >> --- a/xen/include/asm-x86/viommu.h >> +++ b/xen/include/asm-x86/viommu.h >> @@ -23,6 +23,21 @@ >> #include >> #include >> >> +struct irq_remapping_request >> +{ >> +u8 type; >> +u16 source_id; >> +union { >> +/* MSI */ >> +struct { >> +u64 addr; >> +u32 data; >> +} msi; >> +/* Redirection Entry in IOAPIC */ >> +u64 rte; >> +} msg; >> +}; > > Will this work right? As in with the default padding and such? Sorry. 
Could you elaborate this? >> + >> static inline const struct viommu_ops *viommu_get_ops(void) >> { >> return NULL; >> diff --git a/xen/include/xen/viommu.h b/xen/include/xen/viommu.h >> index a0abbdf..246b29d 100644 >> --- a/xen/include/xen/viommu.h >> +++ b/xen/include/xen/viommu.h >> @@ -24,6 +24,10 @@ >> >> #define NR_VIOMMU_PER_DOMAIN 1 >> >> +/* IRQ request type */ >> +#define VIOMMU_REQUEST_IRQ_MSI 0 >> +#define VIOMMU_REQUEST_IRQ_APIC 1 > > What is this used for? This is to designate interrupt type of irq remapping request which contains in the structure irq_remapping_request. The vIOMMU device model uses it to parse request data. >> + >> struct viommu { >> u64 base_address; >> u64 length; >> @@ -36,6 +40,8 @@ struct viommu_ops { >> u64 (*query_caps)(struct domain *d); >> int (*create)(struct domain *d, struct viommu *viommu); >> int (*destroy)(struct viommu *viommu); >> +int (*handle_irq_request)(struct domain *d, >> + struct irq_remapping_request *request); >> }; >> >> struct viommu_info { >> @@ -48,6 +54,8 @@ int viommu_init_domain(struct domain *d); >> int viommu_create(struct domain *d, u64 base_address, u64 length, u64 caps); >> int viommu_destroy(struct domain *d, u32 viommu_id); >> u64 viommu_query_caps(struct domain *d); >> +int viommu_handle_irq_request(struct domain *d, >> + struct irq_remapping_request *request); >> >> #endif /* __XEN_VIOMMU_H__ */ >> >> -- >> 1.8.3.1 >> >> >> ___ >> Xen-devel mailing list >> Xen-devel@lists.xen.org >> https://lists.xen.org/xen-devel -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [RFC PATCH 00/23] xen/vIOMMU: Add vIOMMU support with irq remapping function on Intel platform
On 2017年04月17日 22:41, Konrad Rzeszutek Wilk wrote: > On Mon, Mar 20, 2017 at 02:23:02PM +, Roger Pau Monné wrote: >> On Fri, Mar 17, 2017 at 07:27:00PM +0800, Lan Tianyu wrote: >>> This patchset is to introduce vIOMMU framework and add virtual VTD's >>> interrupt remapping support according "Xen virtual IOMMU high level >>> design doc >>> V3"(https://urldefense.proofpoint.com/v2/url?u=https-3A__lists.xenproject.org_archives_html_xen-2Ddevel_&d=DwIGaQ&c=RoP1YumCXCgaWHvlZYR8PQcxBKCX5YTpkKY057SbK10&r=wAkdPB9j1dAH7AI494B5wFV3Jws7EfB2Q3Sw-K-88Rk&m=7dZfaODS8zbwpYC0vm7gKQXyM8pBPxfGpz8QMDQzU2k&s=3hxzmHH4X0gz9Oz5_PYoOmWFTkyETYTFPCqJ9iXD910&e= >>> >>> 2016-11/msg01391.html). > > It would be awesome if that was as a patch in docs/misc/ Will do that. > > Thanks. > >>> >>> - vIOMMU framework >>> New framework provides viommu_ops and help functions to abstract >>> vIOMMU operations(E,G create, destroy, handle irq remapping request >>> and so on). Vendors(Intel, ARM, AMD and son) can implement their >>> vIOMMU callbacks. >>> >>> - Xen vIOMMU device model in Qemu >>> It's in charge of create/destroy vIOMMU in hypervisor via new vIOMMU >>> DMOP hypercalls. It will be required to pass virtual devices DMA >>> request to hypervisor when enable IOVA(DMA request without PASID) >>> function. >>> >>> - Virtual VTD >>> In this patchset, we enable irq remapping function and covers both >>> MSI and IOAPIC interrupts. Don't support post interrupt mode emulation >>> and post interrupt mode enabled on host with virtual VTD. Will add >>> later. 
>>> >>> Chao Gao (19): >>> Tools/libxc: Add viommu operations in libxc >>> Tools/libacpi: Add DMA remapping reporting (DMAR) ACPI table >>> structures >>> Tools/libacpi: Add new fields in acpi_config to build DMAR table >>> Tools/libacpi: Add a user configurable parameter to control vIOMMU >>> attributes >>> Tools/libxl: Inform device model to create a guest with a vIOMMU >>> device >>> x86/hvm: Introduce a emulated VTD for HVM >>> X86/vvtd: Add MMIO handler for VVTD >>> X86/vvtd: Set Interrupt Remapping Table Pointer through GCMD >>> X86/vvtd: Process interrupt remapping request >>> X86/vvtd: decode interrupt attribute from IRTE >>> X86/vioapic: Hook interrupt delivery of vIOAPIC >>> X86/vvtd: Enable Queued Invalidation through GCMD >>> X86/vvtd: Enable Interrupt Remapping through GCMD >>> x86/vpt: Get interrupt vector through a vioapic interface >>> passthrough: move some fields of hvm_gmsi_info to a sub-structure >>> Tools/libxc: Add a new interface to bind msi-ir with pirq >>> X86/vmsi: Hook guest MSI injection >>> X86/vvtd: Handle interrupt translation faults >>> X86/vvtd: Add queued invalidation (QI) support >>> >>> Lan Tianyu (4): >>> VIOMMU: Add vIOMMU helper functions to create, destroy and query >>> capabilities >>> DMOP: Introduce new DMOP commands for vIOMMU support >>> VIOMMU: Add irq request callback to deal with irq remapping >>> VIOMMU: Add get irq info callback to convert irq remapping request >>> >>> tools/libacpi/acpi2_0.h | 45 + >>> tools/libacpi/build.c | 58 ++ >>> tools/libacpi/libacpi.h | 12 + >>> tools/libs/devicemodel/core.c | 69 ++ >>> tools/libs/devicemodel/include/xendevicemodel.h | 35 + >>> tools/libs/devicemodel/libxendevicemodel.map|3 + >>> tools/libxc/include/xenctrl.h | 17 + >>> tools/libxc/include/xenctrl_compat.h|5 + >>> tools/libxc/xc_devicemodel_compat.c | 18 + >>> tools/libxc/xc_domain.c | 55 + >>> tools/libxl/libxl_create.c | 12 +- >>> tools/libxl/libxl_dm.c |9 + >>> tools/libxl/libxl_dom.c | 85 ++ >>> 
tools/libxl/libxl_types.idl |8 + >>> tools/xl/xl_parse.c | 54 + >>> xen/arch/x86/Makefile |1 + >>> xen/arch/x86/hvm/Makefile |1 + >>> xen/arch/x86/hvm/dm.c | 29 + >>> xen/arch/x86/hvm/irq.c | 10 + >>&
Re: [Xen-devel] [RFC PATCH 15/23] X86/vioapic: Hook interrupt delivery of vIOAPIC
On 2017年04月17日 22:43, Konrad Rzeszutek Wilk wrote: > On Fri, Mar 17, 2017 at 07:27:15PM +0800, Lan Tianyu wrote: >> From: Chao Gao >> >> When irq remapping enabled, IOAPIC Redirection Entry maybe is in remapping >> format. If that, generate a irq_remapping_request and send it to domain. >> >> Signed-off-by: Chao Gao >> Signed-off-by: Lan Tianyu >> --- >> xen/arch/x86/Makefile | 1 + >> xen/arch/x86/hvm/vioapic.c | 10 ++ >> xen/arch/x86/viommu.c | 30 ++ >> xen/include/asm-x86/viommu.h | 3 +++ >> xen/include/public/arch-x86/hvm/save.h | 1 + >> 5 files changed, 45 insertions(+) >> create mode 100644 xen/arch/x86/viommu.c >> >> diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile >> index f75eca0..d49f8c8 100644 >> --- a/xen/arch/x86/Makefile >> +++ b/xen/arch/x86/Makefile >> @@ -66,6 +66,7 @@ obj-y += usercopy.o >> obj-y += x86_emulate.o >> obj-$(CONFIG_TBOOT) += tboot.o >> obj-y += hpet.o >> +obj-y += viommu.o >> obj-y += vm_event.o >> obj-y += xstate.o >> >> diff --git a/xen/arch/x86/hvm/vioapic.c b/xen/arch/x86/hvm/vioapic.c >> index fdbb21f..6a00644 100644 >> --- a/xen/arch/x86/hvm/vioapic.c >> +++ b/xen/arch/x86/hvm/vioapic.c >> @@ -30,6 +30,7 @@ >> #include >> #include >> #include >> +#include >> #include >> #include >> #include >> @@ -285,9 +286,18 @@ static void vioapic_deliver(struct hvm_hw_vioapic >> *vioapic, int irq) >> struct domain *d = vioapic_domain(vioapic); >> struct vlapic *target; >> struct vcpu *v; >> +struct irq_remapping_request request; >> >> ASSERT(spin_is_locked(&d->arch.hvm_domain.irq_lock)); >> >> +if ( vioapic->redirtbl[irq].ir.format ) >> +{ >> +irq_request_ioapic_fill(&request, vioapic->id, >> +vioapic->redirtbl[irq].bits); >> +viommu_handle_irq_request(d, &request); >> +return; >> +} >> + >> HVM_DBG_LOG(DBG_LEVEL_IOAPIC, >> "dest=%x dest_mode=%x delivery_mode=%x " >> "vector=%x trig_mode=%x", >> diff --git a/xen/arch/x86/viommu.c b/xen/arch/x86/viommu.c >> new file mode 100644 >> index 000..ef78d3b >> --- /dev/null >> +++ 
b/xen/arch/x86/viommu.c >> @@ -0,0 +1,30 @@ >> +/* >> + * viommu.c >> + * >> + * virtualize IOMMU. >> + * >> + * Copyright (C) 2017 Chao Gao, Intel Corporation. >> + * >> + * This program is free software; you can redistribute it and/or >> + * modify it under the terms and conditions of the GNU General Public >> + * License, version 2, as published by the Free Software Foundation. >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + * General Public License for more details. >> + * >> + * You should have received a copy of the GNU General Public >> + * License along with this program; If not, see >> <http://www.gnu.org/licenses/>. >> + */ >> + >> +#include >> + >> +void irq_request_ioapic_fill(struct irq_remapping_request *req, >> + uint32_t ioapic_id, uint64_t rte) >> +{ >> +ASSERT(req); >> +req->type = VIOMMU_REQUEST_IRQ_APIC; >> +req->source_id = ioapic_id; >> +req->msg.rte = rte; > > Considering we get 'req' from the stack and it may have garbage, would > it be good to fill out the rest of the entries with sensible values? Or > is there no need for that? Both AMD and Intel will use the function to pass interrupt remapping request. I am afraid different vendors may have different IOAPIC remapping format. How about to parse and check remapping request data in the vendor vIOMMU device module(E,G vvtd)? :) >> +} > > This being a new file, you should probably include the nice > editor configuration block. OK. Will add it. > >> diff --git a/xen/include/asm-x86/viommu.h b/xen/include/asm-x86/viommu.h >> index 0b25f34..fcf3c24 100644 >> --- a/xen/include/asm-x86/viommu.h >> +++ b/xen/include/asm-x86/viommu.h >> @@ -49,6 +49,9 @@ struct irq_remapping_request >> } msg; >> }; >> >> +void irq_request_ioapic_fill(struct irq_remapping_request *req, >
Re: [Xen-devel] [RFC PATCH 5/23] Tools/libxc: Add viommu operations in libxc
On 2017年04月18日 17:08, Paul Durrant wrote: >> -Original Message- >> From: Lan, Tianyu [mailto:tianyu@intel.com] >> Sent: 14 April 2017 16:38 >> To: Paul Durrant ; Wei Liu ; >> Kevin Tian ; Ian Jackson ; >> xen-devel@lists.xen.org >> Subject: Re: [Xen-devel] [RFC PATCH 5/23] Tools/libxc: Add viommu >> operations in libxc >> >> Hi Paul: >> Sorry for later response. >> >> On 3/31/2017 3:57 AM, Chao Gao wrote: >>> On Wed, Mar 29, 2017 at 09:08:06AM +, Paul Durrant wrote: >>>>> -Original Message- >>>>> From: Xen-devel [mailto:xen-devel-boun...@lists.xen.org] On Behalf >> Of >>>>> Chao Gao >>>>> Sent: 29 March 2017 01:40 >>>>> To: Wei Liu >>>>> Cc: Lan Tianyu ; Kevin Tian >> ; >>>>> Ian Jackson ; xen-devel@lists.xen.org >>>>> Subject: Re: [Xen-devel] [RFC PATCH 5/23] Tools/libxc: Add viommu >>>>> operations in libxc >>>>> >>>>> Tianyu is on vacation this two weeks, so I will try to address >>>>> some comments on this series. >>>>> >>>>> On Tue, Mar 28, 2017 at 05:24:03PM +0100, Wei Liu wrote: >>>>>> On Fri, Mar 17, 2017 at 07:27:05PM +0800, Lan Tianyu wrote: >>>>>>> From: Chao Gao >>>>>>> >>>>>>> In previous patch, we introduce a common vIOMMU layer. In our >> design, >>>>>>> we create/destroy vIOMMU through DMOP interface instead of >> creating >>>>> it >>>>>>> according to a config flag of domain. It makes it is possible >>>>>>> to create vIOMMU in device model or in tool stack. >>>>>>> >>>> >>>> I've not been following this closely so apologies if this has already been >> asked... >>>> >>>> Why would you need to create a vIOMMU instance in an external device >> model. >>>> Since the toolstack should be in control of the device model configuration >> why would it not know in advance that one was required? >>> >>> I assume your question is why we don't create a vIOMMU instance via >> hypercall in toolstack. >>> I think creating in toolstack is also ok and is easier to be reused by pvh. >>> >>> If Tianyu has no concern about this, will move this part to toolstack. 
>> >> We can move create/destroy vIOMMU in the tool stack but we still need to >> add such dummy vIOMMU device model in Qemu to pass virtual device's >> DMA >> request into Xen hypervisor. > > Not quite sure I understand this. The QEMu device model does not 'pass DMA > requests' as such, it maps guest RAM and reads or writes to emulate DMA, > right? So, what's needed is a mechanism to map guest RAM by 'bus address'... > i.e. an address that will need to be translated through the vIOMMU mappings. > This is just an evolution of the current 'priv mapping' operations that allow > guest RAM to be mapped by guest physical address. So you don't need a vIOMMU > 'device model' as such, do you? Guest also may enable DMA protection mechanism in linux kernel which limits address space of emulated device and this depends on the vIOMMU's DMA translation function. In vIOMMU's MMIO emulation part is in the Xen hypersior and the guest shadow IO page table will be only in the hypervisor. To translate emulated device's DMA request. It's necessary to pass the DMA request to hypervisor. So far we don't support DMA translation and so doesn't pass DMA request. Map/umap guest memory already support in Qemu and just like emulated device model access guest memory. Qemu also provides vIOMMU hook to receive DMA request and return target guest address. vIOMMU framework will read/write target address. What we need to do is to translate DMA request to target address according shadow IO page table in the hypervisor. > >> Qemu is required to use DMOP hypercall and >> tool stack may use domctl hyercall. vIOMMU hypercalls will be divided >> into two part. >> >> Domctl: >> create, destroy and query. >> DMOP: >> vDev's DMA related operations. > > Yes, the mapping/unmapping operations should be DMOPs and IMO should be > designed such that they can be unified with replacements for current 'priv > map' ops such that QEMU can use the same function call, but with different > address space identifiers (i.e. 
bus address, guest physical address, etc.). > BTW, I say 'etc.' because we should also consider mapping the ioreq pages from Xen using the same call - with a dedicated address space identifier - as well.
Re: [Xen-devel] [RFC PATCH 5/23] Tools/libxc: Add viommu operations in libxc
On 2017年04月18日 22:15, Paul Durrant wrote: >> -Original Message- > [snip] >>> > > >>> > > Not quite sure I understand this. The QEMu device model does not 'pass >> > DMA requests' as such, it maps guest RAM and reads or writes to emulate >> > DMA, right? So, what's needed is a mechanism to map guest RAM by 'bus >> > address'... i.e. an address that will need to be translated through the >> > vIOMMU mappings. This is just an evolution of the current 'priv mapping' >> > operations that allow guest RAM to be mapped by guest physical address. So >> > you don't need a vIOMMU 'device model' as such, do you? >> > >> > >> > Guest also may enable DMA protection mechanism in linux kernel which >> > limits address space of emulated device and this depends on the vIOMMU's >> > DMA translation function. In vIOMMU's MMIO emulation part is in the Xen >> > hypersior and the guest shadow IO page table will be only in the >> > hypervisor. To translate emulated device's DMA request. It's necessary >> > to pass the DMA request to hypervisor. >> > > What do you mean by DMA request though? Are you intending to make some form > of hypercall to read or write guest memory? If so then why not introduce a > call to map the guest memory (via bus address) and read or write directly. Such "DMA request" in Qemu vIOMMU framework just contains IOVA(IO virtual address) and write/read flag. vIOMMU device model just translates IOVA to GPA and then return back to vIOMMU core which will be in charge of memory access. So hyercall we want to introduce is to translate IOVA to GPA. The data to write and target address to store read data aren't passed to vIOMMU device model and we can't perform read/write directly there. >> > So far we don't support DMA translation and so doesn't pass DMA request. >> > > Indeed. We map guest memory using guest physical address because, without an > emulated IOMMU, guest physical address === bus address. 
This is why I suggest > a new mapping operation rather than 'passing a DMA request' to the hypervisor. > >> > Map/umap guest memory already support in Qemu and just like emulated >> > device model access guest memory. Qemu also provides vIOMMU hook to >> > receive DMA request and return target guest address. vIOMMU framework >> > will read/write target address. > That's the part I don't get... why have the vIOMMU code do the reads and > writes? Why not have it provide a mapping function and then have the device > model in QEMU read and write directly as it does now? > Actually it's common interface in Qemu to read/write guest memory. The code will check whether there is a vIOMMU translation callback or not before performing read/write. If yes, call the callback and vIOMMU device model translate IOVA to GPA and then do read/write operation. >> > What we need to do is to translate DMA >> > request to target address according shadow IO page table in the hypervisor. >> > > Yes, so the mapping has to be done by the hypervisor (as is the case for priv > mapping or grant mapping) but the memory accesses themselves can be done > directly by the device model in QEMU. Yes. > >> > >> > >>> > > > >> Qemu is required to use DMOP hypercall and > >> tool stack may use domctl hyercall. vIOMMU hypercalls will be divided > >> into two part. > >> > >> Domctl: > >> create, destroy and query. > >> DMOP: > >> vDev's DMA related operations. >>> > > >>> > > Yes, the mapping/unmapping operations should be DMOPs and IMO >> > should be designed such that they can be unified with replacements for >> > current 'priv map' ops such that QEMU can use the same function call, but >> > with different address space identifiers (i.e. bus address, guest physical >> > address, etc.). BTW, I say 'etc.' because we should also consider mapping >> > the >> > ioreq pages from Xen using the same call - with a dedicated address space >> > identifier - as well. 
>>> > > >> > >> > So you agree to divide vIOMMU's hypercalls into two parts(DMOP and >> > Domctl), right? >> > > Yes, I agree with the logic of the split. > > Cheers, > >Paul > -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [RFC PATCH 5/23] Tools/libxc: Add viommu operations in libxc
On 2017年05月11日 20:35, Wei Liu wrote: > On Mon, Apr 17, 2017 at 08:01:56PM +0800, Lan Tianyu wrote: >> On 2017年04月17日 19:08, Wei Liu wrote: >>> On Fri, Apr 14, 2017 at 11:38:15PM +0800, Lan, Tianyu wrote: >>>> Hi Paul: >>>>Sorry for later response. >>>> >>>> On 3/31/2017 3:57 AM, Chao Gao wrote: >>>>> On Wed, Mar 29, 2017 at 09:08:06AM +, Paul Durrant wrote: >>>>>>> -Original Message- >>>>>>> From: Xen-devel [mailto:xen-devel-boun...@lists.xen.org] On Behalf Of >>>>>>> Chao Gao >>>>>>> Sent: 29 March 2017 01:40 >>>>>>> To: Wei Liu >>>>>>> Cc: Lan Tianyu ; Kevin Tian >>>>>>> ; >>>>>>> Ian Jackson ; xen-devel@lists.xen.org >>>>>>> Subject: Re: [Xen-devel] [RFC PATCH 5/23] Tools/libxc: Add viommu >>>>>>> operations in libxc >>>>>>> >>>>>>> Tianyu is on vacation this two weeks, so I will try to address >>>>>>> some comments on this series. >>>>>>> >>>>>>> On Tue, Mar 28, 2017 at 05:24:03PM +0100, Wei Liu wrote: >>>>>>>> On Fri, Mar 17, 2017 at 07:27:05PM +0800, Lan Tianyu wrote: >>>>>>>>> From: Chao Gao >>>>>>>>> >>>>>>>>> In previous patch, we introduce a common vIOMMU layer. In our design, >>>>>>>>> we create/destroy vIOMMU through DMOP interface instead of creating >>>>>>> it >>>>>>>>> according to a config flag of domain. It makes it is possible >>>>>>>>> to create vIOMMU in device model or in tool stack. >>>>>>>>> >>>>>> >>>>>> I've not been following this closely so apologies if this has already >>>>>> been asked... >>>>>> >>>>>> Why would you need to create a vIOMMU instance in an external device >>>>>> model. >>>>>> Since the toolstack should be in control of the device model >>>>>> configuration why would it not know in advance that one was required? >>>>> >>>>> I assume your question is why we don't create a vIOMMU instance via >>>>> hypercall in toolstack. >>>>> I think creating in toolstack is also ok and is easier to be reused by >>>>> pvh. >>>>> >>>>> If Tianyu has no concern about this, will move this part to toolstack. 
>>>> >>>> We can move create/destroy vIOMMU in the tool stack but we still need to >>>> add >>>> such dummy vIOMMU device model in Qemu to pass virtual device's DMA request >>>> into Xen hypervisor. Qemu is required to use DMOP hypercall and tool stack >>>> may use domctl hyercall. vIOMMU hypercalls will be divided into two part. >>>> >>>> Domctl: >>>>create, destroy and query. >>>> DMOP: >>>>vDev's DMA related operations. >>>> >>>> Is this OK? >>>> >>> >>> Why are they divided into two libraries? Can't they be in DMOP at the >>> same time? >> >> Yes, we can use DMOP for all vIOMMU hyercalls if it's necessary to keep >> unified vIOMMU hyercall type. In theory, DMOP dedicates to be used by >> Qemu but we also can use it in tool stack. If we move create, destroy >> and query operation to tool stack, it isn't necessary to use DMOP for >> them since only tool stack will call them. This is why I said we could >> use domctl for these operations. Both two ways will not affect function >> implementation. Which one it's better from your view? :) >> > > > After reading the subthread I think I agree with Paul. I.e. please > separate them. > Sure. Will update. -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [RFC PATCH V3 1/3] Xen: Increase hap/shadow page pool size to support more vcpus support
Hi Wei: On 2017年09月18日 21:06, Wei Liu wrote: > On Wed, Sep 13, 2017 at 12:52:47AM -0400, Lan Tianyu wrote: >> This patch is to increase page pool size when max vcpu number is larger >> than 128. >> >> Signed-off-by: Lan Tianyu >> --- >> xen/arch/arm/domain.c| 5 + >> xen/arch/x86/domain.c| 25 + >> xen/common/domctl.c | 3 +++ >> xen/include/xen/domain.h | 2 ++ >> 4 files changed, 35 insertions(+) >> >> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c >> index 6512f01..94cf70b 100644 >> --- a/xen/arch/arm/domain.c >> +++ b/xen/arch/arm/domain.c >> @@ -824,6 +824,11 @@ int arch_vcpu_reset(struct vcpu *v) >> return 0; >> } >> >> +int arch_domain_set_max_vcpus(struct domain *d) >> +{ >> +return 0; >> +} >> + >> static int relinquish_memory(struct domain *d, struct page_list_head *list) >> { >> struct page_info *page, *tmp; >> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c >> index dbddc53..0e230f9 100644 >> --- a/xen/arch/x86/domain.c >> +++ b/xen/arch/x86/domain.c >> @@ -1161,6 +1161,31 @@ int arch_vcpu_reset(struct vcpu *v) >> return 0; >> } >> >> +int arch_domain_set_max_vcpus(struct domain *d) > > The name doesn't match what the function does. > I originally hoped to introduce a hook for each arch when set max vcpus. Each arch function can do customized thing and so named "arch_domain_set_max_vcpus". How about "arch_domain_setup_vcpus_resource"? >> +{ >> +int ret; >> + >> +/* Increase page pool in order to support more vcpus. */ >> +if ( d->max_vcpus > 128 ) >> +{ >> +unsigned long nr_pages; >> + >> +if (hap_enabled(d)) > > Coding style. Will update. Thanks. > >> +nr_pages = 1024; >> +else >> +nr_pages = 4096; >> + >> +ret = paging_set_allocation(d, nr_pages, NULL); > > Does this work on PV guests? Sorry. This code should not run for PV guest. Will add a domain type check here. 
> >> +if ( ret != 0 ) >> +{ >> +paging_set_allocation(d, 0, NULL); >> +return ret; >> +} >> +} >> + >> +return 0; >> +} >> + >> long >> arch_do_vcpu_op( >> int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg) >> diff --git a/xen/common/domctl.c b/xen/common/domctl.c >> index 42658e5..64357a3 100644 >> --- a/xen/common/domctl.c >> +++ b/xen/common/domctl.c >> @@ -631,6 +631,9 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) >> u_domctl) >> d->max_vcpus = max; >> } >> >> +if ( arch_domain_set_max_vcpus(d) < 0) > > != 0 please. > Sure. >> +goto maxvcpu_out; >> + >> for ( i = 0; i < max; i++ ) >> { >> if ( d->vcpu[i] != NULL ) >> diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h >> index 347f264..e1ece3a 100644 >> --- a/xen/include/xen/domain.h >> +++ b/xen/include/xen/domain.h >> @@ -81,6 +81,8 @@ void arch_dump_domain_info(struct domain *d); >> >> int arch_vcpu_reset(struct vcpu *); >> >> +int arch_domain_set_max_vcpus(struct domain *d); >> + >> extern spinlock_t vcpu_alloc_lock; >> bool_t domctl_lock_acquire(void); >> void domctl_lock_release(void); >> -- >> 1.8.3.1 >> -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
Re: [Xen-devel] [RFC PATCH V3 1/3] Xen: Increase hap/shadow page pool size to support more vcpus support
On 2017年09月20日 23:13, Wei Liu wrote: > On Tue, Sep 19, 2017 at 11:06:26AM +0800, Lan Tianyu wrote: >> Hi Wei: >> >> On 2017年09月18日 21:06, Wei Liu wrote: >>> On Wed, Sep 13, 2017 at 12:52:47AM -0400, Lan Tianyu wrote: >>>> This patch is to increase page pool size when max vcpu number is larger >>>> than 128. >>>> >>>> Signed-off-by: Lan Tianyu >>>> --- >>>> xen/arch/arm/domain.c| 5 + >>>> xen/arch/x86/domain.c| 25 + >>>> xen/common/domctl.c | 3 +++ >>>> xen/include/xen/domain.h | 2 ++ >>>> 4 files changed, 35 insertions(+) >>>> >>>> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c >>>> index 6512f01..94cf70b 100644 >>>> --- a/xen/arch/arm/domain.c >>>> +++ b/xen/arch/arm/domain.c >>>> @@ -824,6 +824,11 @@ int arch_vcpu_reset(struct vcpu *v) >>>> return 0; >>>> } >>>> >>>> +int arch_domain_set_max_vcpus(struct domain *d) >>>> +{ >>>> +return 0; >>>> +} >>>> + >>>> static int relinquish_memory(struct domain *d, struct page_list_head >>>> *list) >>>> { >>>> struct page_info *page, *tmp; >>>> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c >>>> index dbddc53..0e230f9 100644 >>>> --- a/xen/arch/x86/domain.c >>>> +++ b/xen/arch/x86/domain.c >>>> @@ -1161,6 +1161,31 @@ int arch_vcpu_reset(struct vcpu *v) >>>> return 0; >>>> } >>>> >>>> +int arch_domain_set_max_vcpus(struct domain *d) >>> >>> The name doesn't match what the function does. >>> >> >> I originally hoped to introduce a hook for each arch when set max vcpus. >> Each arch function can do customized thing and so named >> "arch_domain_set_max_vcpus". >> >> How about "arch_domain_setup_vcpus_resource"? > > Before you go away and do a lot of work, please let us think about if > this is the right approach first. Sure. This idea that increase page pool when set max vcpu is from Jan. Jan, Could you help to check whether current patch is right approach? Thanks. 
> > We are close to freeze, with the amount of patches we receive everyday > RFC patch like this one is low on my (can't speak for others) priority > list. I am not sure when I will be able to get back to this, but do ping > us if you want to know where things stand. > -- Best regards Tianyu Lan ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH V3 1/29] Xen/doc: Add Xen virtual IOMMU doc
This patch is to add Xen virtual IOMMU doc to introduce motivation, framework, vIOMMU hypercall and xl configuration. Signed-off-by: Lan Tianyu --- docs/misc/viommu.txt | 136 +++ 1 file changed, 136 insertions(+) create mode 100644 docs/misc/viommu.txt diff --git a/docs/misc/viommu.txt b/docs/misc/viommu.txt new file mode 100644 index 000..348e8c4 --- /dev/null +++ b/docs/misc/viommu.txt @@ -0,0 +1,136 @@ +Xen virtual IOMMU + +Motivation +== +Enable more than 128 vcpu support + +The current requirements of HPC cloud service requires VM with a high +number of CPUs in order to achieve high performance in parallel +computing. + +To support >128 vcpus, X2APIC mode in guest is necessary because legacy +APIC(XAPIC) just supports 8-bit APIC ID. The APIC ID used by Xen is +CPU ID * 2 (ie: CPU 127 has APIC ID 254, which is the last one available +in xAPIC mode) and so it only can support 128 vcpus at most. x2APIC mode +supports 32-bit APIC ID and it requires the interrupt remapping functionality +of a vIOMMU if the guest wishes to route interrupts to all available vCPUs + +The reason for this is that there is no modification for existing PCI MSI +and IOAPIC when introduce X2APIC. PCI MSI/IOAPIC can only send interrupt +message containing 8-bit APIC ID, which cannot address cpus with >254 +APIC ID. Interrupt remapping supports 32-bit APIC ID and so it's necessary +for >128 vcpus support. + + +vIOMMU Architecture +=== +vIOMMU device model is inside Xen hypervisor for following factors +1) Avoid round trips between Qemu and Xen hypervisor +2) Ease of integration with the rest of hypervisor +3) HVMlite/PVH doesn't use Qemu + +* Interrupt remapping overview. +Interrupts from virtual devices and physical devices are delivered +to vLAPIC from vIOAPIC and vMSI. vIOMMU needs to remap interrupt during +this procedure. 
+ ++---+ +|Qemu |VM | +| | ++| +| | | Device driver || +| | ++---+| +| | ^| +| ++ | ++---+| +| | Virtual device | | | IRQ subsystem || +| +---++ | ++---+| +| | | ^| +| | | || ++---+---+ +|hypervisor | | VIRQ | +| |+-++ | +| || vLAPIC | | +| |VIRQ+-++ | +| | ^| +| | || +| |+-++ | +| || vIOMMU | | +| |+-++ | +| | ^| +| | || +| |+-++ | +| || vIOAPIC/vMSI | | +| |++++ | +| | ^^| +| +-+|| +| || ++---+ +HW |IRQ ++---+ +| PCI Device | ++---+ + + +vIOMMU hypercall + +Introduce a new domctl hypercall "xen_domctl_viommu_op" to create/destroy +vIOMMUs. + +* vIOMMU hypercall parameter structure + +/* vIOMMU type - specify vendor vIOMMU device model */ +#define VIOMMU_TYPE_INTEL_VTD 0 + +/* vIOMMU capabilities */ +#define VIOMMU_CAP_IRQ_REMAPPING (1u << 0) + +struct xen_domctl_viommu_op { +uint32_t cmd; +#define XEN_DOMCTL_create_viommu 0 +#define XEN_DOMCTL_destroy_viommu 1 +union { +struct { +/* IN - vIOMMU type */ +uint64_t viommu_type; +/* IN - MMIO base address of vIOMMU. */ +uint64_t base_address; +/* IN - Capabilities with which we want to create */ +uint64_t capabilities; +/* OUT - vIOMMU identity */ +uint32_t viommu_id; +} create_viommu; + +struct { +/* IN - vIOMMU identity */ +uint32_t viommu_id; +} destroy_viommu; +} u; +}; + +- XEN_DOMCTL_create_viommu +Create vIOMMU device with vIOMMU_type, capabilities and MMIO base +address. Hypervisor allocates viommu_id for new vIOMMU instance
[Xen-devel] [PATCH V3 3/29] DOMCTL: Introduce new DOMCTL commands for vIOMMU support
This patch is to introduce create, destroy and query capabilities command for vIOMMU. vIOMMU layer will deal with requests and call arch vIOMMU ops. Signed-off-by: Lan Tianyu --- xen/common/domctl.c | 6 ++ xen/common/viommu.c | 30 ++ xen/include/public/domctl.h | 42 ++ xen/include/xen/viommu.h| 2 ++ 4 files changed, 80 insertions(+) diff --git a/xen/common/domctl.c b/xen/common/domctl.c index 42658e5..7e28237 100644 --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -1149,6 +1149,12 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) copyback = 1; break; +#ifdef CONFIG_VIOMMU +case XEN_DOMCTL_viommu_op: +ret = viommu_domctl(d, &op->u.viommu_op, ©back); +break; +#endif + default: ret = arch_do_domctl(op, d, u_domctl); break; diff --git a/xen/common/viommu.c b/xen/common/viommu.c index 64d91e6..55feb5d 100644 --- a/xen/common/viommu.c +++ b/xen/common/viommu.c @@ -133,6 +133,36 @@ static int viommu_create(struct domain *d, uint64_t type, return 0; } +int viommu_domctl(struct domain *d, struct xen_domctl_viommu_op *op, + bool *need_copy) +{ +int rc = -EINVAL; + +if ( !viommu_enabled() ) +return -ENODEV; + +switch ( op->cmd ) +{ +case XEN_DOMCTL_create_viommu: +rc = viommu_create(d, op->u.create.viommu_type, + op->u.create.base_address, + op->u.create.capabilities, + &op->u.create.viommu_id); +if ( !rc ) +*need_copy = true; +break; + +case XEN_DOMCTL_destroy_viommu: +rc = viommu_destroy_domain(d); +break; + +default: +return -ENOSYS; +} + +return rc; +} + /* * Local variables: * mode: C diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h index 50ff58f..68854b6 100644 --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h @@ -1163,6 +1163,46 @@ struct xen_domctl_psr_cat_op { typedef struct xen_domctl_psr_cat_op xen_domctl_psr_cat_op_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_psr_cat_op_t); +/* vIOMMU helper + * + * vIOMMU interface can be used to create/destroy vIOMMU and + * query vIOMMU capabilities. 
+ */ + +/* vIOMMU type - specify vendor vIOMMU device model */ +#define VIOMMU_TYPE_INTEL_VTD 0 + +/* vIOMMU capabilities */ +#define VIOMMU_CAP_IRQ_REMAPPING (1u << 0) + +struct xen_domctl_viommu_op { +uint32_t cmd; +#define XEN_DOMCTL_create_viommu 0 +#define XEN_DOMCTL_destroy_viommu 1 +union { +struct { +/* IN - vIOMMU type */ +uint64_t viommu_type; +/* + * IN - MMIO base address of vIOMMU. vIOMMU device models + * are in charge of to check base_address. + */ +uint64_t base_address; +/* IN - Capabilities with which we want to create */ +uint64_t capabilities; +/* OUT - vIOMMU identity */ +uint32_t viommu_id; +} create; + +struct { +/* IN - vIOMMU identity */ +uint32_t viommu_id; +} destroy; +} u; +}; +typedef struct xen_domctl_viommu_op xen_domctl_viommu_op; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_viommu_op); + struct xen_domctl { uint32_t cmd; #define XEN_DOMCTL_createdomain 1 @@ -1240,6 +1280,7 @@ struct xen_domctl { #define XEN_DOMCTL_monitor_op77 #define XEN_DOMCTL_psr_cat_op78 #define XEN_DOMCTL_soft_reset79 +#define XEN_DOMCTL_viommu_op 80 #define XEN_DOMCTL_gdbsx_guestmemio1000 #define XEN_DOMCTL_gdbsx_pausevcpu 1001 #define XEN_DOMCTL_gdbsx_unpausevcpu 1002 @@ -1302,6 +1343,7 @@ struct xen_domctl { struct xen_domctl_psr_cmt_oppsr_cmt_op; struct xen_domctl_monitor_opmonitor_op; struct xen_domctl_psr_cat_oppsr_cat_op; +struct xen_domctl_viommu_op viommu_op; uint8_t pad[128]; } u; }; diff --git a/xen/include/xen/viommu.h b/xen/include/xen/viommu.h index 636a2a3..baa8ab7 100644 --- a/xen/include/xen/viommu.h +++ b/xen/include/xen/viommu.h @@ -43,6 +43,8 @@ static inline bool viommu_enabled(void) int viommu_register_type(uint64_t type, struct viommu_ops *ops); int viommu_destroy_domain(struct domain *d); +int viommu_domctl(struct domain *d, struct xen_domctl_viommu_op *op, + bool_t *need_copy); #else static inline int viommu_register_type(uint64_t type, struct viommu_ops *ops) { -- 1.8.3.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org 
https://lists.xen.org/xen-devel
[Xen-devel] [PATCH V3 2/29] VIOMMU: Add vIOMMU helper functions to create, destroy vIOMMU instance
This patch is to introduce an abstract layer for arch vIOMMU implementation to deal with requests from dom0. Arch vIOMMU code needs to provide callback to do create and destroy operation. Signed-off-by: Lan Tianyu --- docs/misc/xen-command-line.markdown | 7 ++ xen/arch/x86/Kconfig| 1 + xen/common/Kconfig | 3 + xen/common/Makefile | 1 + xen/common/domain.c | 4 + xen/common/viommu.c | 144 xen/include/xen/sched.h | 8 ++ xen/include/xen/viommu.h| 63 8 files changed, 231 insertions(+) create mode 100644 xen/common/viommu.c create mode 100644 xen/include/xen/viommu.h diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown index 9797c8d..dfd1db5 100644 --- a/docs/misc/xen-command-line.markdown +++ b/docs/misc/xen-command-line.markdown @@ -1825,3 +1825,10 @@ mode. > Default: `true` Permit use of the `xsave/xrstor` instructions. + +### viommu +> `= ` + +> Default: `false` + +Permit use of viommu interface to create and destroy viommu device model. diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig index 30c2769..1f1de96 100644 --- a/xen/arch/x86/Kconfig +++ b/xen/arch/x86/Kconfig @@ -23,6 +23,7 @@ config X86 select HAS_PDX select NUMA select VGA + select VIOMMU config ARCH_DEFCONFIG string diff --git a/xen/common/Kconfig b/xen/common/Kconfig index dc8e876..2ad2c8d 100644 --- a/xen/common/Kconfig +++ b/xen/common/Kconfig @@ -49,6 +49,9 @@ config HAS_CHECKPOLICY string option env="XEN_HAS_CHECKPOLICY" +config VIOMMU + bool + config KEXEC bool "kexec support" default y diff --git a/xen/common/Makefile b/xen/common/Makefile index 39e2614..da32f71 100644 --- a/xen/common/Makefile +++ b/xen/common/Makefile @@ -56,6 +56,7 @@ obj-y += time.o obj-y += timer.o obj-y += trace.o obj-y += version.o +obj-$(CONFIG_VIOMMU) += viommu.o obj-y += virtual_region.o obj-y += vm_event.o obj-y += vmap.o diff --git a/xen/common/domain.c b/xen/common/domain.c index 5aebcf2..cdb1c9d 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -814,6 +814,10 @@ 
static void complete_domain_destroy(struct rcu_head *head) sched_destroy_domain(d); +#ifdef CONFIG_VIOMMU +viommu_destroy_domain(d); +#endif + /* Free page used by xen oprofile buffer. */ #ifdef CONFIG_XENOPROF free_xenoprof_pages(d); diff --git a/xen/common/viommu.c b/xen/common/viommu.c new file mode 100644 index 000..64d91e6 --- /dev/null +++ b/xen/common/viommu.c @@ -0,0 +1,144 @@ +/* + * common/viommu.c + * + * Copyright (c) 2017 Intel Corporation + * Author: Lan Tianyu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include +#include +#include +#include + +bool __read_mostly opt_viommu; +boolean_param("viommu", opt_viommu); + +static DEFINE_SPINLOCK(type_list_lock); +static LIST_HEAD(type_list); + +struct viommu_type { +uint64_t type; +struct viommu_ops *ops; +struct list_head node; +}; + +int viommu_destroy_domain(struct domain *d) +{ +int ret; + +if ( !d->viommu ) +return -EINVAL; + +ret = d->viommu->ops->destroy(d->viommu); +if ( ret < 0 ) +return ret; + +xfree(d->viommu); +d->viommu = NULL; +return 0; +} + +static struct viommu_type *viommu_get_type(uint64_t type) +{ +struct viommu_type *viommu_type = NULL; + +spin_lock(&type_list_lock); +list_for_each_entry( viommu_type, &type_list, node ) +{ +if ( viommu_type->type == type ) +{ +spin_unlock(&type_list_lock); +return viommu_type; +} +} +spin_unlock(&type_list_lock); + +return NULL; +} + +int viommu_register_type(uint64_t type, struct viommu_ops *ops) +{ +struct viommu_type *viommu_type = NULL; + +if ( !viommu_enabled() ) +return -ENODEV; + +if ( viommu_get_type(type) ) +return -EEXIST; + +viommu_type = xzalloc(struct viommu_type); +if ( !viommu_type ) +return -ENOMEM; + +viommu_type->type = type; +viommu_type->ops = ops; + +spin_lock(&type_list_lock); +list_add_tail(&viommu_type->node, &type_list); +
[Xen-devel] [PATCH V3 7/29] tools/libxl: build DMAR table for a guest with one virtual VTD
From: Chao Gao A new logic is added to build ACPI DMAR table in tool stack for a guest with one virtual VTD and pass through it to guest via existing mechanism. If there already are ACPI tables needed to pass through, we joint the tables. Signed-off-by: Chao Gao Signed-off-by: Lan Tianyu --- v3: - build dmar and initialize related acpi_modules struct in libxl_x86_acpi.c, keeping in accordance with pvh. --- tools/libxl/libxl_x86.c | 3 +- tools/libxl/libxl_x86_acpi.c | 98 ++-- 2 files changed, 96 insertions(+), 5 deletions(-) diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c index 455f6f0..23c9a55 100644 --- a/tools/libxl/libxl_x86.c +++ b/tools/libxl/libxl_x86.c @@ -381,8 +381,7 @@ int libxl__arch_domain_finalise_hw_description(libxl__gc *gc, { int rc = 0; -if ((info->type == LIBXL_DOMAIN_TYPE_HVM) && -(info->device_model_version == LIBXL_DEVICE_MODEL_VERSION_NONE)) { +if (info->type == LIBXL_DOMAIN_TYPE_HVM) { rc = libxl__dom_load_acpi(gc, info, dom); if (rc != 0) LOGE(ERROR, "libxl_dom_load_acpi failed"); diff --git a/tools/libxl/libxl_x86_acpi.c b/tools/libxl/libxl_x86_acpi.c index 1761756..adf02f4 100644 --- a/tools/libxl/libxl_x86_acpi.c +++ b/tools/libxl/libxl_x86_acpi.c @@ -16,6 +16,7 @@ #include "libxl_arch.h" #include #include +#include "libacpi/acpi2_0.h" #include "libacpi/libacpi.h" #include @@ -161,9 +162,9 @@ out: return rc; } -int libxl__dom_load_acpi(libxl__gc *gc, - const libxl_domain_build_info *b_info, - struct xc_dom_image *dom) +static int libxl__dom_load_acpi_pvh(libxl__gc *gc, +const libxl_domain_build_info *b_info, +struct xc_dom_image *dom) { struct acpi_config config = {0}; struct libxl_acpi_ctxt libxl_ctxt; @@ -236,6 +237,97 @@ out: return rc; } +static void *acpi_memalign(struct acpi_ctxt *ctxt, uint32_t size, + uint32_t align) +{ +int ret; +void *ptr; + +ret = posix_memalign(&ptr, align, size); +if (ret != 0 || !ptr) +return NULL; + +return ptr; +} + +/* + * For hvm, we don't need build acpi in libxl. 
Instead, it's built in hvmloader. + * But if one hvm has virtual VTD(s), we build DMAR table for it and joint this + * table with existing content in acpi_modules in order to employ HVM + * firmware pass-through mechanism to pass-through DMAR table. + */ +static int libxl__dom_load_acpi_hvm(libxl__gc *gc, +const libxl_domain_build_info *b_info, +struct xc_dom_image *dom) +{ +struct acpi_config config = { 0 }; +struct acpi_ctxt ctxt; +void *table; +uint32_t len; + +if ((b_info->type != LIBXL_DOMAIN_TYPE_HVM) || +(b_info->device_model_version == LIBXL_DEVICE_MODEL_VERSION_NONE) || +(b_info->num_viommus != 1) || +(b_info->viommu[0].type != LIBXL_VIOMMU_TYPE_INTEL_VTD)) +return 0; + +ctxt.mem_ops.alloc = acpi_memalign; +ctxt.mem_ops.v2p = virt_to_phys; +ctxt.mem_ops.free = acpi_mem_free; + +if (libxl_defbool_val(b_info->viommu[0].intremap)) +config.iommu_intremap_supported = true; +/* x2apic is always enabled since in no case we must disable it */ +config.iommu_x2apic_supported = true; +config.iommu_base_addr = b_info->viommu[0].base_addr; + +/* IOAPIC id and PSEUDO BDF */ +config.ioapic_id = 1; +config.ioapic_bus = 0xff; +config.ioapic_devfn = 0x0; + +config.host_addr_width = 39; + +table = construct_dmar(&ctxt, &config); +if ( !table ) +return ERROR_NOMEM; +len = ((struct acpi_header *)table)->length; + +if (len) { +libxl__ptr_add(gc, table); +if (!dom->acpi_modules[0].data) { +dom->acpi_modules[0].data = table; +dom->acpi_modules[0].length = len; +} else { +/* joint tables */ +void *newdata; + +newdata = libxl__malloc(gc, len + dom->acpi_modules[0].length); +memcpy(newdata, dom->acpi_modules[0].data, + dom->acpi_modules[0].length); +memcpy(newdata + dom->acpi_modules[0].length, table, len); + +free(dom->acpi_modules[0].data); +dom->acpi_modules[0].data = newdata; +dom->acpi_modules[0].length += len; +} +} +return 0; +} + +int libxl__dom_load_acpi(libxl__gc *gc, + const libxl_domain_build_info *b_info, + struct xc_dom_image *dom) +{ + +if (b_info->type != 
LIBXL_DOMAIN_TYPE_HVM) +return 0; + +if (b_info->device_model_version == LIBXL_DEVICE_MODEL_VERSION_NONE) +return li
[Xen-devel] [PATCH V3 6/29] tools/libxl: Add a user-configurable parameter to control vIOMMU attributes
From: Chao Gao A field, viommu_info, is added to struct libxl_domain_build_info. Several attributes can be specified by guest config file for virtual IOMMU. These attributes are used for DMAR construction and vIOMMU creation. Signed-off-by: Chao Gao Signed-off-by: Lan Tianyu --- v3: - allow an array of viommu other than only one viommu to present to guest. During domain building, an error would be raised for multiple viommus case since we haven't implemented this yet. - provide a libxl__viommu_set_default() for viommu --- docs/man/xl.cfg.pod.5.in| 27 +++ tools/libxl/libxl_create.c | 52 + tools/libxl/libxl_types.idl | 12 +++ tools/xl/xl_parse.c | 52 - 4 files changed, 142 insertions(+), 1 deletion(-) diff --git a/docs/man/xl.cfg.pod.5.in b/docs/man/xl.cfg.pod.5.in index 79cb2ea..9cd7dd7 100644 --- a/docs/man/xl.cfg.pod.5.in +++ b/docs/man/xl.cfg.pod.5.in @@ -1547,6 +1547,33 @@ L<http://www.microsoft.com/en-us/download/details.aspx?id=30707> =back +=item B + +Specifies the vIOMMUs which are to be provided to the guest. + +B has the form C where: + +=over 4 + +=item B + +Possible Bs are: + +=over 4 + +=item B + +Currently there is only one valid type: + +(x86 only) "intel_vtd" means providing a emulated Intel VT-d to the guest. + +=item B + +Specifies whether the vIOMMU should support interrupt remapping +and default 'true'. 
+ +=back + =head3 Guest Virtual Time Controls =over 4 diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c index 9123585..decd7a8 100644 --- a/tools/libxl/libxl_create.c +++ b/tools/libxl/libxl_create.c @@ -27,6 +27,8 @@ #include +#define VIOMMU_VTD_BASE_ADDR0xfed9ULL + int libxl__domain_create_info_setdefault(libxl__gc *gc, libxl_domain_create_info *c_info) { @@ -59,6 +61,47 @@ void libxl__rdm_setdefault(libxl__gc *gc, libxl_domain_build_info *b_info) LIBXL_RDM_MEM_BOUNDARY_MEMKB_DEFAULT; } +static int libxl__viommu_set_default(libxl__gc *gc, + libxl_domain_build_info *b_info) +{ +int i; + +if (!b_info->num_viommus) +return 0; + +for (i = 0; i < b_info->num_viommus; i++) { +libxl_viommu_info *viommu = &b_info->viommu[i]; + +if (libxl_defbool_is_default(viommu->intremap)) +libxl_defbool_set(&viommu->intremap, true); + +if (!libxl_defbool_val(viommu->intremap)) { +LOGE(ERROR, "Cannot create one virtual VTD without intremap"); +return ERROR_INVAL; +} + +if (viommu->type == LIBXL_VIOMMU_TYPE_INTEL_VTD) { +/* + * If there are multiple vIOMMUs, we need arrange all vIOMMUs to + * avoid overlap. Put a check here in case we get here for multiple + * vIOMMUs case. 
+ */ +if (b_info->num_viommus > 1) { +LOGE(ERROR, "Multiple vIOMMUs support is under implementation"); +return ERROR_INVAL; +} + +/* Set default values to unexposed fields */ +viommu->base_addr = VIOMMU_VTD_BASE_ADDR; + +/* Set desired capbilities */ +viommu->cap = VIOMMU_CAP_IRQ_REMAPPING; +} +} + +return 0; +} + int libxl__domain_build_info_setdefault(libxl__gc *gc, libxl_domain_build_info *b_info) { @@ -214,6 +257,9 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc, libxl__arch_domain_build_info_acpi_setdefault(b_info); +if (libxl__viommu_set_default(gc, b_info)) +return ERROR_FAIL; + switch (b_info->type) { case LIBXL_DOMAIN_TYPE_HVM: if (b_info->shadow_memkb == LIBXL_MEMKB_DEFAULT) @@ -890,6 +936,12 @@ static void initiate_domain_create(libxl__egc *egc, goto error_out; } +if (d_config->b_info.num_viommus > 1) { +ret = ERROR_INVAL; +LOGD(ERROR, domid, "Cannot support multiple vIOMMUs"); +goto error_out; +} + ret = libxl__domain_create_info_setdefault(gc, &d_config->c_info); if (ret) { LOGD(ERROR, domid, "Unable to set domain create info defaults"); diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl index 173d70a..286c960 100644 --- a/tools/libxl/libxl_types.idl +++ b/tools/libxl/libxl_types.idl @@ -450,6 +450,17 @@ libxl_altp2m_mode = Enumeration("altp2m_mode", [ (3, "limited"), ], init_val = "LIBXL_ALTP2M_MODE_DISABLED") +libxl_viommu_type = Enumeration("viommu_type", [ +(1, "intel_vtd"), +]) + +libxl_viommu_info = Struct("viommu_info", [ +("type"
[Xen-devel] [PATCH V3 00/29]
Change since v2: 1) Remove vIOMMU hypercall of query capabilities and introduce when necessary. 2) Remove length field of vIOMMU create parameter of vIOMMU hypercall 3) Introduce irq remapping mode callback to vIOMMU framework and vIOMMU device models can check irq remapping mode by vendor specific ways. 4) Update vIOMMU docs. 5) Other changes please see patches' change logs. Change since v1: 1) Fix coding style issues 2) Add definitions for vIOMMU type and capabilities 3) Change vIOMMU kconfig and select vIOMMU default on x86 4) Put vIOMMU creation in libxl__arch_domain_create() 5) Make vIOMMU structure of tool stack more general for both PV and HVM. Change since RFC v2: 1) Move vvtd.c to drivers/passthrough/vtd directroy. 2) Make vIOMMU always built in on x86 3) Add new boot cmd "viommu" to enable viommu function 4) Fix some code stype issues. Change since RFC v1: 1) Add Xen virtual IOMMU doc docs/misc/viommu.txt 2) Move vIOMMU hypercall of create/destroy vIOMMU and query capabilities from dmop to domctl suggested by Paul Durrant. Because these hypercalls can be done in tool stack and more VM mode(E,G PVH or other modes don't use Qemu) can be benefit. 3) Add check of input MMIO address and length. 4) Add iommu_type in vIOMMU hypercall parameter to specify vendor vIOMMU device model(E,G Intel VTD, AMD or ARM IOMMU. So far only support Intel VTD). 5) Add save and restore support for vvtd This patchset is to introduce vIOMMU framework and add virtual VTD's interrupt remapping support according "Xen virtual IOMMU high level design doc V3"(https://lists.xenproject.org/archives/html/xen-devel/ 2016-11/msg01391.html). - vIOMMU framework New framework provides viommu_ops and help functions to abstract vIOMMU operations(E,G create, destroy, handle irq remapping request and so on). Vendors(Intel, ARM, AMD and son) can implement their vIOMMU callbacks. - Virtual VTD We enable irq remapping function and covers both MSI and IOAPIC interrupts. 
Don't support post interrupt mode emulation and post interrupt mode enabled on host with virtual VTD. will add later. Repo: https://github.com/lantianyu/Xen/tree/xen_viommu_v3 Chao Gao (23): tools/libacpi: Add DMA remapping reporting (DMAR) ACPI table structures tools/libacpi: Add new fields in acpi_config for DMAR table tools/libxl: Add a user configurable parameter to control vIOMMU attributes tools/libxl: build DMAR table for a guest with one virtual VTD tools/libxl: create vIOMMU during domain construction tools/libxc: Add viommu operations in libxc vtd: add and align register definitions x86/hvm: Introduce a emulated VTD for HVM x86/vvtd: Add MMIO handler for VVTD x86/vvtd: Set Interrupt Remapping Table Pointer through GCMD x86/vvtd: Enable Interrupt Remapping through GCMD x86/vvtd: Process interrupt remapping request x86/vvtd: decode interrupt attribute from IRTE x86/vvtd: add a helper function to decide the interrupt format x86/vioapic: Hook interrupt delivery of vIOAPIC x86/vioapic: extend vioapic_get_vector() to support remapping format RTE passthrough: move some fields of hvm_gmsi_info to a sub-structure tools/libxc: Add a new interface to bind remapping format msi with pirq x86/vmsi: Hook delivering remapping format msi to guest x86/vvtd: Handle interrupt translation faults x86/vvtd: Enable Queued Invalidation through GCMD x86/vvtd: Add queued invalidation (QI) support x86/vvtd: save and restore emulated VT-d Lan Tianyu (6): Xen/doc: Add Xen virtual IOMMU doc VIOMMU: Add vIOMMU helper functions to create, destroy vIOMMU instance DOMCTL: Introduce new DOMCTL commands for vIOMMU support VIOMMU: Add irq request callback to deal with irq remapping VIOMMU: Add get irq info callback to convert irq remapping request VIOMMU: Introduce callback of checking irq remapping mode docs/man/xl.cfg.pod.5.in | 27 + docs/misc/viommu.txt | 136 docs/misc/xen-command-line.markdown|7 + tools/libacpi/acpi2_0.h| 61 ++ tools/libacpi/build.c | 53 ++ tools/libacpi/libacpi.h| 12 + 
tools/libxc/Makefile |1 + tools/libxc/include/xenctrl.h | 21 + tools/libxc/xc_domain.c| 53 ++ tools/libxc/xc_viommu.c| 64 ++ tools/libxl/libxl_create.c | 52 ++ tools/libxl/libxl_types.idl| 12 + tools/libxl/libxl_x86.c| 20 +- tools/libxl/libxl_x86_acpi.c | 98 ++- tools/xl/xl_parse.c| 52 +- xen/arch/x86/Kconfig |1 + xen/arch/x86/hvm/irq.c |7 + xen/arch/x86/hvm/vioapic.c | 26 +- xen/arch/x86/hvm/vmsi.c| 18 +- xen/common/K
[Xen-devel] [PATCH V3 4/29] tools/libacpi: Add DMA remapping reporting (DMAR) ACPI table structures
From: Chao Gao Add dmar table structure according Chapter 8 "BIOS Considerations" of VTd spec Rev. 2.4. VTd spec:http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf Signed-off-by: Chao Gao Signed-off-by: Lan Tianyu --- tools/libacpi/acpi2_0.h | 61 + 1 file changed, 61 insertions(+) diff --git a/tools/libacpi/acpi2_0.h b/tools/libacpi/acpi2_0.h index 2619ba3..758a823 100644 --- a/tools/libacpi/acpi2_0.h +++ b/tools/libacpi/acpi2_0.h @@ -422,6 +422,65 @@ struct acpi_20_slit { }; /* + * DMA Remapping Table header definition (DMAR) + */ + +/* + * DMAR Flags. + */ +#define ACPI_DMAR_INTR_REMAP(1 << 0) +#define ACPI_DMAR_X2APIC_OPT_OUT(1 << 1) + +struct acpi_dmar { +struct acpi_header header; +uint8_t host_address_width; +uint8_t flags; +uint8_t reserved[10]; +}; + +/* + * Device Scope Types + */ +#define ACPI_DMAR_DEVICE_SCOPE_PCI_ENDPOINT 0x01 +#define ACPI_DMAR_DEVICE_SCOPE_PCI_SUB_HIERARACHY 0x01 +#define ACPI_DMAR_DEVICE_SCOPE_IOAPIC 0x03 +#define ACPI_DMAR_DEVICE_SCOPE_HPET 0x04 +#define ACPI_DMAR_DEVICE_SCOPE_ACPI_NAMESPACE_DEVICE0x05 + +struct dmar_device_scope { +uint8_t type; +uint8_t length; +uint8_t reserved[2]; +uint8_t enumeration_id; +uint8_t bus; +uint16_t path[0]; +}; + +/* + * DMA Remapping Hardware Unit Types + */ +#define ACPI_DMAR_TYPE_HARDWARE_UNIT0x00 +#define ACPI_DMAR_TYPE_RESERVED_MEMORY 0x01 +#define ACPI_DMAR_TYPE_ATSR 0x02 +#define ACPI_DMAR_TYPE_HARDWARE_AFFINITY0x03 +#define ACPI_DMAR_TYPE_ANDD 0x04 + +/* + * DMA Remapping Hardware Unit Flags. All other bits are reserved and must be 0. + */ +#define ACPI_DMAR_INCLUDE_PCI_ALL (1 << 0) + +struct acpi_dmar_hardware_unit { +uint16_t type; +uint16_t length; +uint8_t flags; +uint8_t reserved; +uint16_t pci_segment; +uint64_t base_address; +struct dmar_device_scope scope[0]; +}; + +/* * Table Signatures. 
*/ #define ACPI_2_0_RSDP_SIGNATURE ASCII64('R','S','D',' ','P','T','R',' ') @@ -435,6 +494,7 @@ struct acpi_20_slit { #define ACPI_2_0_WAET_SIGNATURE ASCII32('W','A','E','T') #define ACPI_2_0_SRAT_SIGNATURE ASCII32('S','R','A','T') #define ACPI_2_0_SLIT_SIGNATURE ASCII32('S','L','I','T') +#define ACPI_2_0_DMAR_SIGNATURE ASCII32('D','M','A','R') /* * Table revision numbers. @@ -449,6 +509,7 @@ struct acpi_20_slit { #define ACPI_1_0_FADT_REVISION 0x01 #define ACPI_2_0_SRAT_REVISION 0x01 #define ACPI_2_0_SLIT_REVISION 0x01 +#define ACPI_2_0_DMAR_REVISION 0x01 #pragma pack () -- 1.8.3.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH V3 11/29] x86/hvm: Introduce an emulated VTD for HVM
From: Chao Gao This patch adds create/destroy function for the emulated VTD and adapts it to the common VIOMMU abstraction. Signed-off-by: Chao Gao Signed-off-by: Lan Tianyu --- xen/drivers/passthrough/vtd/Makefile | 7 +- xen/drivers/passthrough/vtd/iommu.h | 23 +- xen/drivers/passthrough/vtd/vvtd.c | 147 +++ 3 files changed, 170 insertions(+), 7 deletions(-) create mode 100644 xen/drivers/passthrough/vtd/vvtd.c diff --git a/xen/drivers/passthrough/vtd/Makefile b/xen/drivers/passthrough/vtd/Makefile index f302653..163c7fe 100644 --- a/xen/drivers/passthrough/vtd/Makefile +++ b/xen/drivers/passthrough/vtd/Makefile @@ -1,8 +1,9 @@ subdir-$(CONFIG_X86) += x86 -obj-y += iommu.o obj-y += dmar.o -obj-y += utils.o -obj-y += qinval.o obj-y += intremap.o +obj-y += iommu.o +obj-y += qinval.o obj-y += quirks.o +obj-y += utils.o +obj-$(CONFIG_VIOMMU) += vvtd.o diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h index d7e433e..ef038c9 100644 --- a/xen/drivers/passthrough/vtd/iommu.h +++ b/xen/drivers/passthrough/vtd/iommu.h @@ -66,6 +66,12 @@ #define VER_MAJOR(v)(((v) & 0xf0) >> 4) #define VER_MINOR(v)((v) & 0x0f) +/* Supported Adjusted Guest Address Widths */ +#define DMA_CAP_SAGAW_SHIFT 8 + /* 39-bit AGAW, 3-level page-table */ +#define DMA_CAP_SAGAW_39bit (0x2ULL << DMA_CAP_SAGAW_SHIFT) +#define DMA_CAP_ND_64K 6ULL + /* * Decoding Capability Register */ @@ -74,6 +80,7 @@ #define cap_write_drain(c) (((c) >> 54) & 1) #define cap_max_amask_val(c) (((c) >> 48) & 0x3f) #define cap_num_fault_regs(c) c) >> 40) & 0xff) + 1) +#define cap_set_num_fault_regs(c) c) - 1) & 0xff) << 40) #define cap_pgsel_inv(c) (((c) >> 39) & 1) #define cap_super_page_val(c) (((c) >> 34) & 0xf) @@ -85,11 +92,13 @@ #define cap_sps_1tb(c) ((c >> 37) & 1) #define cap_fault_reg_offset(c)c) >> 24) & 0x3ff) * 16) +#define cap_set_fault_reg_offset(c) c) / 16) & 0x3ff) << 24 ) #define cap_isoch(c)(((c) >> 23) & 1) #define cap_qos(c)(((c) >> 22) & 1) #define cap_mgaw(c)c) >> 16) 
& 0x3f) + 1) -#define cap_sagaw(c)(((c) >> 8) & 0x1f) +#define cap_set_mgaw(c) c) - 1) & 0x3f) << 16) +#define cap_sagaw(c)(((c) >> DMA_CAP_SAGAW_SHIFT) & 0x1f) #define cap_caching_mode(c)(((c) >> 7) & 1) #define cap_phmr(c)(((c) >> 6) & 1) #define cap_plmr(c)(((c) >> 5) & 1) @@ -104,10 +113,16 @@ #define ecap_niotlb_iunits(e)e) >> 24) & 0xff) + 1) #define ecap_iotlb_offset(e) e) >> 8) & 0x3ff) * 16) #define ecap_coherent(e) ((e >> 0) & 0x1) -#define ecap_queued_inval(e) ((e >> 1) & 0x1) +#define DMA_ECAP_QI_SHIFT1 +#define DMA_ECAP_QI (1ULL << DMA_ECAP_QI_SHIFT) +#define ecap_queued_inval(e) ((e >> DMA_ECAP_QI_SHIFT) & 0x1) #define ecap_dev_iotlb(e)((e >> 2) & 0x1) -#define ecap_intr_remap(e) ((e >> 3) & 0x1) -#define ecap_eim(e) ((e >> 4) & 0x1) +#define DMA_ECAP_IR_SHIFT3 +#define DMA_ECAP_IR (1ULL << DMA_ECAP_IR_SHIFT) +#define ecap_intr_remap(e) ((e >> DMA_ECAP_IR_SHIFT) & 0x1) +#define DMA_ECAP_EIM_SHIFT 4 +#define DMA_ECAP_EIM (1ULL << DMA_ECAP_EIM_SHIFT) +#define ecap_eim(e) ((e >> DMA_ECAP_EIM_SHIFT) & 0x1) #define ecap_cache_hints(e) ((e >> 5) & 0x1) #define ecap_pass_thru(e)((e >> 6) & 0x1) #define ecap_snp_ctl(e) ((e >> 7) & 0x1) diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c new file mode 100644 index 000..c851ec7 --- /dev/null +++ b/xen/drivers/passthrough/vtd/vvtd.c @@ -0,0 +1,147 @@ +/* + * vvtd.c + * + * virtualize VTD for HVM. + * + * Copyright (C) 2017 Chao Gao, Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms and conditions of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; If not, see <http://www.gnu.org/licenses/>. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iommu.h" + +/* Supported ca
[Xen-devel] [PATCH V3 8/29] tools/libxl: create vIOMMU during domain construction
From: Chao Gao If guest is configured to have a vIOMMU, create it during domain construction. Signed-off-by: Chao Gao Signed-off-by: Lan Tianyu --- v3: - Remove the process of querying capabilities. --- tools/libxl/libxl_x86.c | 17 + 1 file changed, 17 insertions(+) diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c index 23c9a55..25cae5f 100644 --- a/tools/libxl/libxl_x86.c +++ b/tools/libxl/libxl_x86.c @@ -341,8 +341,25 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config, if (d_config->b_info.type == LIBXL_DOMAIN_TYPE_HVM) { unsigned long shadow = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024); +int i; + xc_shadow_control(ctx->xch, domid, XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION, NULL, 0, &shadow, 0, NULL); + +for (i = 0; i < d_config->b_info.num_viommus; i++) { +uint32_t id; +libxl_viommu_info *viommu = d_config->b_info.viommu + i; + +if (viommu->type == LIBXL_VIOMMU_TYPE_INTEL_VTD) { +ret = xc_viommu_create(ctx->xch, domid, VIOMMU_TYPE_INTEL_VTD, + viommu->base_addr, viommu->cap, &id); +if (ret) { +LOGED(ERROR, domid, "create vIOMMU fail"); +ret = ERROR_FAIL; +goto out; +} +} +} } if (d_config->c_info.type == LIBXL_DOMAIN_TYPE_PV && -- 1.8.3.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH V3 17/29] x86/vvtd: add a helper function to decide the interrupt format
From: Chao Gao Different platform may use different method to distinguish remapping format interrupt and normal format interrupt. Intel uses one bit in IOAPIC RTE or MSI address register to indicate the interrupt is remapping format. vvtd will handle all the interrupts when .check_irq_remapping() return true. Signed-off-by: Chao Gao Signed-off-by: Lan Tianyu --- xen/drivers/passthrough/vtd/vvtd.c | 25 - 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c index 5e22ace..bd1cadd 100644 --- a/xen/drivers/passthrough/vtd/vvtd.c +++ b/xen/drivers/passthrough/vtd/vvtd.c @@ -536,6 +536,28 @@ static int vvtd_get_irq_info(struct domain *d, return 0; } +/* Probe whether the interrupt request is an remapping format */ +static bool vvtd_is_remapping(struct domain *d, + struct arch_irq_remapping_request *irq) +{ +if ( irq->type == VIOMMU_REQUEST_IRQ_APIC ) +{ +struct IO_APIC_route_remap_entry rte = { .val = irq->msg.rte }; + +return rte.format; +} +else if ( irq->type == VIOMMU_REQUEST_IRQ_MSI ) +{ +struct msi_msg_remap_entry msi_msg = +{ .address_lo = { .val = irq->msg.msi.addr } }; + +return msi_msg.address_lo.format; +} +ASSERT_UNREACHABLE(); + +return 0; +} + static void vvtd_reset(struct vvtd *vvtd, uint64_t capability) { uint64_t cap = cap_set_num_fault_regs(1ULL) | @@ -607,7 +629,8 @@ struct viommu_ops vvtd_hvm_vmx_ops = { .create = vvtd_create, .destroy = vvtd_destroy, .handle_irq_request = vvtd_handle_irq_request, -.get_irq_info = vvtd_get_irq_info +.get_irq_info = vvtd_get_irq_info, +.check_irq_remapping = vvtd_is_remapping }; static int vvtd_register(void) -- 1.8.3.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH V3 13/29] x86/vvtd: Set Interrupt Remapping Table Pointer through GCMD
From: Chao Gao Software sets this field to set/update the interrupt remapping table pointer used by hardware. The interrupt remapping table pointer is specified through the Interrupt Remapping Table Address (IRTA_REG) register. This patch emulates this operation and adds some new fields in VVTD to track info (e.g. the table's gfn and max supported entries) of interrupt remapping table. Signed-off-by: Chao Gao Signed-off-by: Lan Tianyu --- v3: - ignore unaligned r/w of vt-d hardware registers and return X86EMUL_OK --- xen/drivers/passthrough/vtd/iommu.h | 12 ++- xen/drivers/passthrough/vtd/vvtd.c | 69 + 2 files changed, 80 insertions(+), 1 deletion(-) diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h index ef038c9..a0d5ec8 100644 --- a/xen/drivers/passthrough/vtd/iommu.h +++ b/xen/drivers/passthrough/vtd/iommu.h @@ -153,6 +153,8 @@ #define DMA_GCMD_IRE(((u64)1) << 25) #define DMA_GCMD_SIRTP (((u64)1) << 24) #define DMA_GCMD_CFI(((u64)1) << 23) +/* mask of one-shot bits */ +#define DMA_GCMD_ONE_SHOT_MASK 0x96ff /* GSTS_REG */ #define DMA_GSTS_TES(((u64)1) << 31) @@ -162,9 +164,17 @@ #define DMA_GSTS_WBFS (((u64)1) << 27) #define DMA_GSTS_QIES (((u64)1) <<26) #define DMA_GSTS_IRES (((u64)1) <<25) -#define DMA_GSTS_SIRTPS (((u64)1) << 24) +#define DMA_GSTS_SIRTPS_SHIFT 24 +#define DMA_GSTS_SIRTPS (((u64)1) << DMA_GSTS_SIRTPS_SHIFT) #define DMA_GSTS_CFIS (((u64)1) <<23) +/* IRTA_REG */ +/* The base of 4KB aligned interrupt remapping table */ +#define DMA_IRTA_ADDR(val) ((val) & ~0xfffULL) +/* The size of remapping table is 2^(x+1), where x is the size field in IRTA */ +#define DMA_IRTA_S(val) (val & 0xf) +#define DMA_IRTA_SIZE(val) (1UL << (DMA_IRTA_S(val) + 1)) + /* PMEN_REG */ #define DMA_PMEN_EPM(((u32)1) << 31) #define DMA_PMEN_PRS(((u32)1) << 0) diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c index a3002c3..6736956 100644 --- a/xen/drivers/passthrough/vtd/vvtd.c +++ 
b/xen/drivers/passthrough/vtd/vvtd.c @@ -32,6 +32,13 @@ /* Supported capabilities by vvtd */ unsigned int vvtd_caps = VIOMMU_CAP_IRQ_REMAPPING; +struct hvm_hw_vvtd_status { +uint32_t eim_enabled : 1; +uint32_t irt_max_entry; +/* Interrupt remapping table base gfn */ +uint64_t irt; +}; + union hvm_hw_vvtd_regs { uint32_t data32[256]; uint64_t data64[128]; @@ -43,6 +50,8 @@ struct vvtd { uint64_t length; /* Point back to the owner domain */ struct domain *domain; + +struct hvm_hw_vvtd_status status; union hvm_hw_vvtd_regs *regs; struct page_info *regs_page; }; @@ -70,6 +79,11 @@ struct vvtd *domain_vvtd(struct domain *d) return (d->viommu) ? d->viommu->priv : NULL; } +static inline void vvtd_set_bit(struct vvtd *vvtd, uint32_t reg, int nr) +{ +__set_bit(nr, &vvtd->regs->data32[reg/sizeof(uint32_t)]); +} + static inline void vvtd_set_reg(struct vvtd *vtd, uint32_t reg, uint32_t value) { vtd->regs->data32[reg/sizeof(uint32_t)] = value; @@ -91,6 +105,44 @@ static inline uint64_t vvtd_get_reg_quad(struct vvtd *vtd, uint32_t reg) return vtd->regs->data64[reg/sizeof(uint64_t)]; } +static void vvtd_handle_gcmd_sirtp(struct vvtd *vvtd, uint32_t val) +{ +uint64_t irta = vvtd_get_reg_quad(vvtd, DMAR_IRTA_REG); + +if ( !(val & DMA_GCMD_SIRTP) ) +return; + +vvtd->status.irt = DMA_IRTA_ADDR(irta) >> PAGE_SHIFT; +vvtd->status.irt_max_entry = DMA_IRTA_SIZE(irta); +vvtd->status.eim_enabled = !!(irta & IRTA_EIME); +vvtd_info("Update IR info (addr=%lx eim=%d size=%d).", + vvtd->status.irt, vvtd->status.eim_enabled, + vvtd->status.irt_max_entry); +vvtd_set_bit(vvtd, DMAR_GSTS_REG, DMA_GSTS_SIRTPS_SHIFT); +} + +static int vvtd_write_gcmd(struct vvtd *vvtd, uint32_t val) +{ +uint32_t orig = vvtd_get_reg(vvtd, DMAR_GSTS_REG); +uint32_t changed; + +orig = orig & DMA_GCMD_ONE_SHOT_MASK; /* reset the one-shot bits */ +changed = orig ^ val; + +if ( !changed ) +return X86EMUL_OKAY; + +if ( changed & (changed - 1) ) +vvtd_info("Guest attempts to write %x to GCMD (current GSTS is %x)," + "it 
would lead to update multiple fields", + val, orig); + +if ( changed & DMA_GCMD_SIRTP ) +vvtd_handle_gcmd_sirtp(vvtd, val); + +return X86EMUL_OKAY; +} + static int vvtd_in_range(struct vcpu *v, unsigned long addr) { struct vvtd *vvtd = domain_vvtd(v->domain); @@ -135,12 +187,17 @@ static int vvtd_write(struct vcpu *v, unsigned long addr, { switch ( offset ) { +case DMAR_GCMD_REG: +return vvtd_write_gcmd(v
[Xen-devel] [PATCH V3 14/29] x86/vvtd: Enable Interrupt Remapping through GCMD
From: Chao Gao Software writes this field to enable/disable interrupt reampping. This patch emulate IRES field of GCMD. Signed-off-by: Chao Gao Signed-off-by: Lan Tianyu --- xen/drivers/passthrough/vtd/iommu.h | 3 ++- xen/drivers/passthrough/vtd/vvtd.c | 30 +- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h index a0d5ec8..703726f 100644 --- a/xen/drivers/passthrough/vtd/iommu.h +++ b/xen/drivers/passthrough/vtd/iommu.h @@ -163,7 +163,8 @@ #define DMA_GSTS_AFLS (((u64)1) << 28) #define DMA_GSTS_WBFS (((u64)1) << 27) #define DMA_GSTS_QIES (((u64)1) <<26) -#define DMA_GSTS_IRES (((u64)1) <<25) +#define DMA_GSTS_IRES_SHIFT 25 +#define DMA_GSTS_IRES (((u64)1) << DMA_GSTS_IRES_SHIFT) #define DMA_GSTS_SIRTPS_SHIFT 24 #define DMA_GSTS_SIRTPS (((u64)1) << DMA_GSTS_SIRTPS_SHIFT) #define DMA_GSTS_CFIS (((u64)1) <<23) diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c index 6736956..a0f63e9 100644 --- a/xen/drivers/passthrough/vtd/vvtd.c +++ b/xen/drivers/passthrough/vtd/vvtd.c @@ -33,7 +33,8 @@ unsigned int vvtd_caps = VIOMMU_CAP_IRQ_REMAPPING; struct hvm_hw_vvtd_status { -uint32_t eim_enabled : 1; +uint32_t eim_enabled : 1, + intremap_enabled : 1; uint32_t irt_max_entry; /* Interrupt remapping table base gfn */ uint64_t irt; @@ -84,6 +85,11 @@ static inline void vvtd_set_bit(struct vvtd *vvtd, uint32_t reg, int nr) __set_bit(nr, &vvtd->regs->data32[reg/sizeof(uint32_t)]); } +static inline void vvtd_clear_bit(struct vvtd *vvtd, uint32_t reg, int nr) +{ +__clear_bit(nr, &vvtd->regs->data32[reg/sizeof(uint32_t)]); +} + static inline void vvtd_set_reg(struct vvtd *vtd, uint32_t reg, uint32_t value) { vtd->regs->data32[reg/sizeof(uint32_t)] = value; @@ -105,6 +111,23 @@ static inline uint64_t vvtd_get_reg_quad(struct vvtd *vtd, uint32_t reg) return vtd->regs->data64[reg/sizeof(uint64_t)]; } +static void vvtd_handle_gcmd_ire(struct vvtd *vvtd, uint32_t val) +{ 
+vvtd_info("%sable Interrupt Remapping", + (val & DMA_GCMD_IRE) ? "En" : "Dis"); + +if ( val & DMA_GCMD_IRE ) +{ +vvtd->status.intremap_enabled = true; +vvtd_set_bit(vvtd, DMAR_GSTS_REG, DMA_GSTS_IRES_SHIFT); +} +else +{ +vvtd->status.intremap_enabled = false; +vvtd_clear_bit(vvtd, DMAR_GSTS_REG, DMA_GSTS_IRES_SHIFT); +} +} + static void vvtd_handle_gcmd_sirtp(struct vvtd *vvtd, uint32_t val) { uint64_t irta = vvtd_get_reg_quad(vvtd, DMAR_IRTA_REG); @@ -112,6 +135,9 @@ static void vvtd_handle_gcmd_sirtp(struct vvtd *vvtd, uint32_t val) if ( !(val & DMA_GCMD_SIRTP) ) return; +if ( vvtd->status.intremap_enabled ) +vvtd_info("Update Interrupt Remapping Table when active\n"); + vvtd->status.irt = DMA_IRTA_ADDR(irta) >> PAGE_SHIFT; vvtd->status.irt_max_entry = DMA_IRTA_SIZE(irta); vvtd->status.eim_enabled = !!(irta & IRTA_EIME); @@ -139,6 +165,8 @@ static int vvtd_write_gcmd(struct vvtd *vvtd, uint32_t val) if ( changed & DMA_GCMD_SIRTP ) vvtd_handle_gcmd_sirtp(vvtd, val); +if ( changed & DMA_GCMD_IRE ) +vvtd_handle_gcmd_ire(vvtd, val); return X86EMUL_OKAY; } -- 1.8.3.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH V3 18/29] VIOMMU: Add irq request callback to deal with irq remapping
This patch is to add irq request callback for platform implementation to deal with irq remapping request. Signed-off-by: Lan Tianyu --- xen/common/viommu.c | 15 + xen/include/asm-x86/viommu.h | 72 xen/include/xen/viommu.h | 11 +++ 3 files changed, 98 insertions(+) create mode 100644 xen/include/asm-x86/viommu.h diff --git a/xen/common/viommu.c b/xen/common/viommu.c index 55feb5d..b517158 100644 --- a/xen/common/viommu.c +++ b/xen/common/viommu.c @@ -163,6 +163,21 @@ int viommu_domctl(struct domain *d, struct xen_domctl_viommu_op *op, return rc; } +int viommu_handle_irq_request(struct domain *d, + struct arch_irq_remapping_request *request) +{ +struct viommu *viommu = d->viommu; + +if ( !viommu ) +return -EINVAL; + +ASSERT(viommu->ops); +if ( !viommu->ops->handle_irq_request ) +return -EINVAL; + +return viommu->ops->handle_irq_request(d, request); +} + /* * Local variables: * mode: C diff --git a/xen/include/asm-x86/viommu.h b/xen/include/asm-x86/viommu.h new file mode 100644 index 000..366fbb6 --- /dev/null +++ b/xen/include/asm-x86/viommu.h @@ -0,0 +1,72 @@ +/* + * include/asm-x86/viommu.h + * + * Copyright (c) 2017 Intel Corporation. + * Author: Lan Tianyu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; If not, see <http://www.gnu.org/licenses/>. 
+ * + */ +#ifndef __ARCH_X86_VIOMMU_H__ +#define __ARCH_X86_VIOMMU_H__ + +/* IRQ request type */ +#define VIOMMU_REQUEST_IRQ_MSI 0 +#define VIOMMU_REQUEST_IRQ_APIC 1 + +struct arch_irq_remapping_request +{ +union { +/* MSI */ +struct { +uint64_t addr; +uint32_t data; +} msi; +/* Redirection Entry in IOAPIC */ +uint64_t rte; +} msg; +uint16_t source_id; +uint8_t type; +}; + +static inline void irq_request_ioapic_fill(struct arch_irq_remapping_request *req, + uint32_t ioapic_id, uint64_t rte) +{ +ASSERT(req); +req->type = VIOMMU_REQUEST_IRQ_APIC; +req->source_id = ioapic_id; +req->msg.rte = rte; +} + +static inline void irq_request_msi_fill(struct arch_irq_remapping_request *req, +uint32_t source_id, uint64_t addr, +uint32_t data) +{ +ASSERT(req); +req->type = VIOMMU_REQUEST_IRQ_MSI; +req->source_id = source_id; +req->msg.msi.addr = addr; +req->msg.msi.data = data; +} + +#endif /* __ARCH_X86_VIOMMU_H__ */ + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/include/xen/viommu.h b/xen/include/xen/viommu.h index baa8ab7..230f6b1 100644 --- a/xen/include/xen/viommu.h +++ b/xen/include/xen/viommu.h @@ -21,10 +21,13 @@ #define __XEN_VIOMMU_H__ struct viommu; +struct arch_irq_remapping_request; struct viommu_ops { int (*create)(struct domain *d, struct viommu *viommu); int (*destroy)(struct viommu *viommu); +int (*handle_irq_request)(struct domain *d, + struct arch_irq_remapping_request *request); }; struct viommu { @@ -45,11 +48,19 @@ int viommu_register_type(uint64_t type, struct viommu_ops *ops); int viommu_destroy_domain(struct domain *d); int viommu_domctl(struct domain *d, struct xen_domctl_viommu_op *op, bool_t *need_copy); +int viommu_handle_irq_request(struct domain *d, + struct arch_irq_remapping_request *request); #else static inline int viommu_register_type(uint64_t type, struct viommu_ops *ops) { return -EINVAL; } +static inline int 
+viommu_handle_irq_request(struct domain *d, + struct arch_irq_remapping_request *request) +{ +return -EINVAL; +} #endif #endif /* __XEN_VIOMMU_H__ */ -- 1.8.3.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH V3 9/29] tools/libxc: Add viommu operations in libxc
From: Chao Gao This patch adds XEN_DOMCTL_viommu_op hypercall. This hypercall comprises two sub-commands: - create(): create a vIOMMU in Xen, given viommu type, register-set location and capabilities - destroy(): destroy a vIOMMU specified by viommu_id Signed-off-by: Chao Gao Signed-off-by: Lan Tianyu --- v3: - Remove API for querying viommu capabilities - Remove pointless cast - Polish commit message - Coding style --- tools/libxc/Makefile | 1 + tools/libxc/include/xenctrl.h | 4 +++ tools/libxc/xc_viommu.c | 64 +++ 3 files changed, 69 insertions(+) create mode 100644 tools/libxc/xc_viommu.c diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile index 9a019e8..7d8c4b4 100644 --- a/tools/libxc/Makefile +++ b/tools/libxc/Makefile @@ -51,6 +51,7 @@ CTRL_SRCS-$(CONFIG_MiniOS) += xc_minios.c CTRL_SRCS-y += xc_evtchn_compat.c CTRL_SRCS-y += xc_gnttab_compat.c CTRL_SRCS-y += xc_devicemodel_compat.c +CTRL_SRCS-y += xc_viommu.c GUEST_SRCS-y := GUEST_SRCS-y += xg_private.c xc_suspend.c diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h index 43151cb..bedca1f 100644 --- a/tools/libxc/include/xenctrl.h +++ b/tools/libxc/include/xenctrl.h @@ -2501,6 +2501,10 @@ enum xc_static_cpu_featuremask { const uint32_t *xc_get_static_cpu_featuremask(enum xc_static_cpu_featuremask); const uint32_t *xc_get_feature_deep_deps(uint32_t feature); +int xc_viommu_create(xc_interface *xch, domid_t dom, uint64_t type, + uint64_t base_addr, uint64_t cap, uint32_t *viommu_id); +int xc_viommu_destroy(xc_interface *xch, domid_t dom, uint32_t viommu_id); + #endif int xc_livepatch_upload(xc_interface *xch, diff --git a/tools/libxc/xc_viommu.c b/tools/libxc/xc_viommu.c new file mode 100644 index 000..17507c5 --- /dev/null +++ b/tools/libxc/xc_viommu.c @@ -0,0 +1,64 @@ +/* + * xc_viommu.c + * + * viommu related API functions. 
+ * + * Copyright (C) 2017 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License, version 2.1, as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; If not, see <http://www.gnu.org/licenses/>. + */ + +#include "xc_private.h" + +int xc_viommu_create(xc_interface *xch, domid_t dom, uint64_t type, + uint64_t base_addr, uint64_t cap, uint32_t *viommu_id) +{ +int rc; + +DECLARE_DOMCTL; + +domctl.cmd = XEN_DOMCTL_viommu_op; +domctl.domain = dom; +domctl.u.viommu_op.cmd = XEN_DOMCTL_create_viommu; +domctl.u.viommu_op.u.create.viommu_type = type; +domctl.u.viommu_op.u.create.base_address = base_addr; +domctl.u.viommu_op.u.create.capabilities = cap; + +rc = do_domctl(xch, &domctl); +if ( !rc ) +*viommu_id = domctl.u.viommu_op.u.create.viommu_id; + +return rc; +} + +int xc_viommu_destroy(xc_interface *xch, domid_t dom, uint32_t viommu_id) +{ +DECLARE_DOMCTL; + +domctl.cmd = XEN_DOMCTL_viommu_op; +domctl.domain = dom; +domctl.u.viommu_op.cmd = XEN_DOMCTL_destroy_viommu; +domctl.u.viommu_op.u.destroy.viommu_id = viommu_id; + +return do_domctl(xch, &domctl); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ -- 1.8.3.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH V3 23/29] passthrough: move some fields of hvm_gmsi_info to a sub-structure
From: Chao Gao No functional change. It is a preparation for introducing new fields in hvm_gmsi_info to manage remapping format msi bound to a physical msi. Signed-off-by: Chao Gao Signed-off-by: Lan Tianyu --- xen/arch/x86/hvm/vmsi.c | 4 ++-- xen/drivers/passthrough/io.c | 34 ++ xen/include/asm-x86/hvm/irq.h | 8 ++-- 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c index 9b35e9b..7f21853 100644 --- a/xen/arch/x86/hvm/vmsi.c +++ b/xen/arch/x86/hvm/vmsi.c @@ -101,8 +101,8 @@ int vmsi_deliver( void vmsi_deliver_pirq(struct domain *d, const struct hvm_pirq_dpci *pirq_dpci) { -uint32_t flags = pirq_dpci->gmsi.gflags; -int vector = pirq_dpci->gmsi.gvec; +uint32_t flags = pirq_dpci->gmsi.legacy.gflags; +int vector = pirq_dpci->gmsi.legacy.gvec; uint8_t dest = (uint8_t)flags; bool dest_mode = flags & XEN_DOMCTL_VMSI_X86_DM_MASK; uint8_t delivery_mode = MASK_EXTR(flags, XEN_DOMCTL_VMSI_X86_DELIV_MASK); diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c index ec9f41a..fb44223 100644 --- a/xen/drivers/passthrough/io.c +++ b/xen/drivers/passthrough/io.c @@ -350,8 +350,8 @@ int pt_irq_create_bind( { pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED | HVM_IRQ_DPCI_MACH_MSI | HVM_IRQ_DPCI_GUEST_MSI; -pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec; -pirq_dpci->gmsi.gflags = gflags; +pirq_dpci->gmsi.legacy.gvec = pt_irq_bind->u.msi.gvec; +pirq_dpci->gmsi.legacy.gflags = gflags; /* * 'pt_irq_create_bind' can be called after 'pt_irq_destroy_bind'. * The 'pirq_cleanup_check' which would free the structure is only @@ -383,8 +383,8 @@ int pt_irq_create_bind( } if ( unlikely(rc) ) { -pirq_dpci->gmsi.gflags = 0; -pirq_dpci->gmsi.gvec = 0; +pirq_dpci->gmsi.legacy.gflags = 0; +pirq_dpci->gmsi.legacy.gvec = 0; pirq_dpci->dom = NULL; pirq_dpci->flags = 0; pirq_cleanup_check(info, d); @@ -403,21 +403,22 @@ int pt_irq_create_bind( } /* If pirq is already mapped as vmsi, update guest data/addr. 
*/ -if ( pirq_dpci->gmsi.gvec != pt_irq_bind->u.msi.gvec || - pirq_dpci->gmsi.gflags != gflags ) +if ( pirq_dpci->gmsi.legacy.gvec != pt_irq_bind->u.msi.gvec || + pirq_dpci->gmsi.legacy.gflags != gflags ) { /* Directly clear pending EOIs before enabling new MSI info. */ pirq_guest_eoi(info); -pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec; -pirq_dpci->gmsi.gflags = gflags; +} +pirq_dpci->gmsi.legacy.gvec = pt_irq_bind->u.msi.gvec; +pirq_dpci->gmsi.legacy.gflags = gflags; } } /* Calculate dest_vcpu_id for MSI-type pirq migration. */ -dest = MASK_EXTR(pirq_dpci->gmsi.gflags, +dest = MASK_EXTR(pirq_dpci->gmsi.legacy.gflags, XEN_DOMCTL_VMSI_X86_DEST_ID_MASK); -dest_mode = pirq_dpci->gmsi.gflags & XEN_DOMCTL_VMSI_X86_DM_MASK; -delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags, +dest_mode = pirq_dpci->gmsi.legacy.gflags & XEN_DOMCTL_VMSI_X86_DM_MASK; +delivery_mode = MASK_EXTR(pirq_dpci->gmsi.legacy.gflags, XEN_DOMCTL_VMSI_X86_DELIV_MASK); dest_vcpu_id = hvm_girq_dest_2_vcpu_id(d, dest, dest_mode); @@ -430,7 +431,7 @@ int pt_irq_create_bind( { if ( delivery_mode == dest_LowestPrio ) vcpu = vector_hashing_dest(d, dest, dest_mode, - pirq_dpci->gmsi.gvec); + pirq_dpci->gmsi.legacy.gvec); if ( vcpu ) pirq_dpci->gmsi.posted = true; } @@ -440,7 +441,7 @@ int pt_irq_create_bind( /* Use interrupt posting if it is supported. */ if ( iommu_intpost ) pi_update_irte(vcpu ? &vcpu->arch.hvm_vmx.pi_desc : NULL, - info, pirq_dpci->gmsi.gvec); + info, pirq_dpci->gmsi.legacy.gvec); if ( pt_irq_bind->u.msi.gflags & XEN_DOMCTL_VMSI_X86_UNMASKED ) { @@ -835,11 +836,12 @@ static int _hvm_dpci_msi_eoi(struct domain *d, int vector = (long)arg; if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) && - (pirq_dpci->gmsi.gvec == vector) ) + (pirq_dpci->gmsi.legacy.gvec == vector) ) { -
[Xen-devel] [PATCH V3 5/29] tools/libacpi: Add new fields in acpi_config for DMAR table
From: Chao Gao The BIOS reports the remapping hardware units in a platform to system software through the DMA Remapping Reporting (DMAR) ACPI table. New fields are introduced for DMAR table. These new fields are set by toolstack through parsing guest's config file. construct_dmar() is added to build DMAR table according to the new fields. Signed-off-by: Chao Gao Signed-off-by: Lan Tianyu --- v3: - Remove chip-set specific IOAPIC BDF. Instead, let IOAPIC-related info be passed by struct acpi_config. --- tools/libacpi/build.c | 53 + tools/libacpi/libacpi.h | 12 +++ 2 files changed, 65 insertions(+) diff --git a/tools/libacpi/build.c b/tools/libacpi/build.c index f9881c9..5ee8fcd 100644 --- a/tools/libacpi/build.c +++ b/tools/libacpi/build.c @@ -303,6 +303,59 @@ static struct acpi_20_slit *construct_slit(struct acpi_ctxt *ctxt, return slit; } +/* + * Only one DMA remapping hardware unit is exposed and all devices + * are under the remapping hardware unit. I/O APIC should be explicitly + * enumerated. 
+ */ +struct acpi_dmar *construct_dmar(struct acpi_ctxt *ctxt, + const struct acpi_config *config) +{ +struct acpi_dmar *dmar; +struct acpi_dmar_hardware_unit *drhd; +struct dmar_device_scope *scope; +unsigned int size; +unsigned int ioapic_scope_size = sizeof(*scope) + sizeof(scope->path[0]); + +size = sizeof(*dmar) + sizeof(*drhd) + ioapic_scope_size; + +dmar = ctxt->mem_ops.alloc(ctxt, size, 16); +if ( !dmar ) +return NULL; + +memset(dmar, 0, size); +dmar->header.signature = ACPI_2_0_DMAR_SIGNATURE; +dmar->header.revision = ACPI_2_0_DMAR_REVISION; +dmar->header.length = size; +fixed_strcpy(dmar->header.oem_id, ACPI_OEM_ID); +fixed_strcpy(dmar->header.oem_table_id, ACPI_OEM_TABLE_ID); +dmar->header.oem_revision = ACPI_OEM_REVISION; +dmar->header.creator_id = ACPI_CREATOR_ID; +dmar->header.creator_revision = ACPI_CREATOR_REVISION; +dmar->host_address_width = config->host_addr_width - 1; +if ( config->iommu_intremap_supported ) +dmar->flags |= ACPI_DMAR_INTR_REMAP; +if ( !config->iommu_x2apic_supported ) +dmar->flags |= ACPI_DMAR_X2APIC_OPT_OUT; + +drhd = (struct acpi_dmar_hardware_unit *)((void*)dmar + sizeof(*dmar)); +drhd->type = ACPI_DMAR_TYPE_HARDWARE_UNIT; +drhd->length = sizeof(*drhd) + ioapic_scope_size; +drhd->flags = ACPI_DMAR_INCLUDE_PCI_ALL; +drhd->pci_segment = 0; +drhd->base_address = config->iommu_base_addr; + +scope = &drhd->scope[0]; +scope->type = ACPI_DMAR_DEVICE_SCOPE_IOAPIC; +scope->length = ioapic_scope_size; +scope->enumeration_id = config->ioapic_id; +scope->bus = config->ioapic_bus; +scope->path[0] = config->ioapic_devfn; + +set_checksum(dmar, offsetof(struct acpi_header, checksum), size); +return dmar; +} + static int construct_passthrough_tables(struct acpi_ctxt *ctxt, unsigned long *table_ptrs, int nr_tables, diff --git a/tools/libacpi/libacpi.h b/tools/libacpi/libacpi.h index a2efd23..fdd6a78 100644 --- a/tools/libacpi/libacpi.h +++ b/tools/libacpi/libacpi.h @@ -20,6 +20,8 @@ #ifndef __LIBACPI_H__ #define __LIBACPI_H__ +#include + 
#define ACPI_HAS_COM1 (1<<0) #define ACPI_HAS_COM2 (1<<1) #define ACPI_HAS_LPT1 (1<<2) @@ -96,8 +98,18 @@ struct acpi_config { uint32_t ioapic_base_address; uint16_t pci_isa_irq_mask; uint8_t ioapic_id; + +/* Emulated IOMMU features, location and IOAPIC under the scope of IOMMU */ +bool iommu_intremap_supported; +bool iommu_x2apic_supported; +uint8_t host_addr_width; +uint8_t ioapic_bus; +uint16_t ioapic_devfn; +uint64_t iommu_base_addr; }; +struct acpi_dmar *construct_dmar(struct acpi_ctxt *ctxt, + const struct acpi_config *config); int acpi_build_tables(struct acpi_ctxt *ctxt, struct acpi_config *config); #endif /* __LIBACPI_H__ */ -- 1.8.3.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH V3 12/29] x86/vvtd: Add MMIO handler for VVTD
From: Chao Gao This patch adds VVTD MMIO handler to deal with MMIO access. Signed-off-by: Chao Gao Signed-off-by: Lan Tianyu --- xen/drivers/passthrough/vtd/vvtd.c | 91 ++ 1 file changed, 91 insertions(+) diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c index c851ec7..a3002c3 100644 --- a/xen/drivers/passthrough/vtd/vvtd.c +++ b/xen/drivers/passthrough/vtd/vvtd.c @@ -47,6 +47,29 @@ struct vvtd { struct page_info *regs_page; }; +/* Setting viommu_verbose enables debugging messages of vIOMMU */ +bool __read_mostly viommu_verbose; +boolean_runtime_param("viommu_verbose", viommu_verbose); + +#ifndef NDEBUG +#define vvtd_info(fmt...) do {\ +if ( viommu_verbose ) \ +gprintk(XENLOG_G_INFO, ## fmt); \ +} while(0) +#define vvtd_debug(fmt...) do { \ +if ( viommu_verbose && printk_ratelimit() ) \ +printk(XENLOG_G_DEBUG fmt); \ +} while(0) +#else +#define vvtd_info(fmt...) do {} while(0) +#define vvtd_debug(fmt...) do {} while(0) +#endif + +struct vvtd *domain_vvtd(struct domain *d) +{ +return (d->viommu) ? 
d->viommu->priv : NULL; +} + static inline void vvtd_set_reg(struct vvtd *vtd, uint32_t reg, uint32_t value) { vtd->regs->data32[reg/sizeof(uint32_t)] = value; @@ -68,6 +91,73 @@ static inline uint64_t vvtd_get_reg_quad(struct vvtd *vtd, uint32_t reg) return vtd->regs->data64[reg/sizeof(uint64_t)]; } +static int vvtd_in_range(struct vcpu *v, unsigned long addr) +{ +struct vvtd *vvtd = domain_vvtd(v->domain); + +if ( vvtd ) +return (addr >= vvtd->base_addr) && + (addr < vvtd->base_addr + PAGE_SIZE); +return 0; +} + +static int vvtd_read(struct vcpu *v, unsigned long addr, + unsigned int len, unsigned long *pval) +{ +struct vvtd *vvtd = domain_vvtd(v->domain); +unsigned int offset = addr - vvtd->base_addr; + +vvtd_info("Read offset %x len %d\n", offset, len); + +if ( (len != 4 && len != 8) || (offset & (len - 1)) ) +return X86EMUL_OKAY; + +if ( len == 4 ) +*pval = vvtd_get_reg(vvtd, offset); +else +*pval = vvtd_get_reg_quad(vvtd, offset); + +return X86EMUL_OKAY; +} + +static int vvtd_write(struct vcpu *v, unsigned long addr, + unsigned int len, unsigned long val) +{ +struct vvtd *vvtd = domain_vvtd(v->domain); +unsigned int offset = addr - vvtd->base_addr; + +vvtd_info("Write offset %x len %d val %lx\n", offset, len, val); + +if ( (len != 4 && len != 8) || (offset & (len - 1)) ) +return X86EMUL_OKAY; + +if ( len == 4 ) +{ +switch ( offset ) +{ +case DMAR_IEDATA_REG: +case DMAR_IEADDR_REG: +case DMAR_IEUADDR_REG: +case DMAR_FEDATA_REG: +case DMAR_FEADDR_REG: +case DMAR_FEUADDR_REG: +vvtd_set_reg(vvtd, offset, val); +break; + +default: +break; +} +} + +return X86EMUL_OKAY; +} + +static const struct hvm_mmio_ops vvtd_mmio_ops = { +.check = vvtd_in_range, +.read = vvtd_read, +.write = vvtd_write +}; + static void vvtd_reset(struct vvtd *vvtd, uint64_t capability) { uint64_t cap = cap_set_num_fault_regs(1ULL) | @@ -109,6 +199,7 @@ static int vvtd_create(struct domain *d, struct viommu *viommu) vvtd_reset(vvtd, viommu->caps); vvtd->base_addr = viommu->base_address; 
vvtd->domain = d; +register_mmio_handler(d, &vvtd_mmio_ops); viommu->priv = vvtd; -- 1.8.3.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel
[Xen-devel] [PATCH V3 10/29] vtd: add and align register definitions
From: Chao Gao No functional changes. Signed-off-by: Chao Gao Signed-off-by: Lan Tianyu --- xen/drivers/passthrough/vtd/iommu.h | 54 + 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h index 72c1a2e..d7e433e 100644 --- a/xen/drivers/passthrough/vtd/iommu.h +++ b/xen/drivers/passthrough/vtd/iommu.h @@ -23,31 +23,39 @@ #include /* - * Intel IOMMU register specification per version 1.0 public spec. + * Intel IOMMU register specification per version 2.4 public spec. */ -#defineDMAR_VER_REG0x0/* Arch version supported by this IOMMU */ -#defineDMAR_CAP_REG0x8/* Hardware supported capabilities */ -#defineDMAR_ECAP_REG0x10/* Extended capabilities supported */ -#defineDMAR_GCMD_REG0x18/* Global command register */ -#defineDMAR_GSTS_REG0x1c/* Global status register */ -#defineDMAR_RTADDR_REG0x20/* Root entry table */ -#defineDMAR_CCMD_REG0x28/* Context command reg */ -#defineDMAR_FSTS_REG0x34/* Fault Status register */ -#defineDMAR_FECTL_REG0x38/* Fault control register */ -#defineDMAR_FEDATA_REG0x3c/* Fault event interrupt data register */ -#defineDMAR_FEADDR_REG0x40/* Fault event interrupt addr register */ -#defineDMAR_FEUADDR_REG 0x44/* Upper address register */ -#defineDMAR_AFLOG_REG0x58/* Advanced Fault control */ -#defineDMAR_PMEN_REG0x64/* Enable Protected Memory Region */ -#defineDMAR_PLMBASE_REG 0x68/* PMRR Low addr */ -#defineDMAR_PLMLIMIT_REG 0x6c/* PMRR low limit */ -#defineDMAR_PHMBASE_REG 0x70/* pmrr high base addr */ -#defineDMAR_PHMLIMIT_REG 0x78/* pmrr high limit */ -#defineDMAR_IQH_REG0x80/* invalidation queue head */ -#defineDMAR_IQT_REG0x88/* invalidation queue tail */ -#defineDMAR_IQA_REG0x90/* invalidation queue addr */ -#defineDMAR_IRTA_REG 0xB8/* intr remap */ +#define DMAR_VER_REG0x0 /* Arch version supported by this IOMMU */ +#define DMAR_CAP_REG0x8 /* Hardware supported capabilities */ +#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */ +#define 
DMAR_GCMD_REG 0x18 /* Global command register */ +#define DMAR_GSTS_REG 0x1c /* Global status register */ +#define DMAR_RTADDR_REG 0x20 /* Root entry table */ +#define DMAR_CCMD_REG 0x28 /* Context command reg */ +#define DMAR_FSTS_REG 0x34 /* Fault Status register */ +#define DMAR_FECTL_REG 0x38 /* Fault control register */ +#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */ +#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */ +#define DMAR_FEUADDR_REG0x44 /* Upper address register */ +#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */ +#define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */ +#define DMAR_PLMBASE_REG0x68 /* PMRR Low addr */ +#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ +#define DMAR_PHMBASE_REG0x70 /* pmrr high base addr */ +#define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */ +#define DMAR_IQH_REG0x80 /* invalidation queue head */ +#define DMAR_IQT_REG0x88 /* invalidation queue tail */ +#define DMAR_IQT_REG_HI 0x8c +#define DMAR_IQA_REG0x90 /* invalidation queue addr */ +#define DMAR_IQA_REG_HI 0x94 +#define DMAR_ICS_REG0x9c /* Invalidation complete status */ +#define DMAR_IECTL_REG 0xa0 /* Invalidation event control */ +#define DMAR_IEDATA_REG 0xa4 /* Invalidation event data */ +#define DMAR_IEADDR_REG 0xa8 /* Invalidation event address */ +#define DMAR_IEUADDR_REG0xac /* Invalidation event address */ +#define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr */ +#define DMAR_IRTA_REG_HI0xbc #define OFFSET_STRIDE(9) #define dmar_readl(dmar, reg) readl((dmar) + (reg)) -- 1.8.3.1 ___ Xen-devel mailing list Xen-devel@lists.xen.org https://lists.xen.org/xen-devel