This patch implements update_pi_irte function hook to allow SVM
communicate to IOMMU driver regarding how to set up IRTE for handling
posted interrupt.

In case AVIC is enabled, during vcpu_load/unload, SVM needs to update
IOMMU IRTE with appropriate host physical APIC ID. Also, when
vcpu_blocking/unblocking, SVM needs to update the is-running bit in
the IOMMU IRTE. Both are achieved via calling amd_iommu_update_ga().

However, if GA mode is not enabled for the pass-through device,
IOMMU driver will simply just return when calling amd_iommu_update_ga.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpa...@amd.com>
---
 arch/x86/kvm/svm.c        | 247 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/amd-iommu.h |   1 +
 2 files changed, 227 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c060e05..303007a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -43,6 +43,7 @@
 #include <asm/desc.h>
 #include <asm/debugreg.h>
 #include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>
 
 #include <asm/virtext.h>
 #include "trace.h"
@@ -200,6 +201,16 @@ struct vcpu_svm {
        struct page *avic_backing_page;
        u64 *avic_physical_id_cache;
        bool avic_is_running;
+
+       /*
+        * Per-vcpu list of struct amd_ir_data:
+        * This is used mainly to track interrupt remapping table entry (IRTE)
+        * to be updated when the vcpu affinity changes. This avoid the need
+        * for scanning for IRTE and try to match ga_tag in the IOMMU driver
+        * (or using hashtable).
+        */
+       struct list_head ir_list;
+       spinlock_t ir_list_lock;
 };
 
 #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK   (0xFF)
@@ -1443,31 +1454,29 @@ free_avic:
        return err;
 }
 
-/**
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+static inline int
+avic_update_iommu(struct kvm_vcpu *vcpu, int cpu, phys_addr_t pa, bool r)
 {
-       u64 entry;
-       int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
+       int ret;
+       unsigned long flags;
        struct vcpu_svm *svm = to_svm(vcpu);
+       struct amd_ir_data *ir;
 
-       if (!kvm_vcpu_apicv_active(vcpu))
-               return;
-
-       svm->avic_is_running = is_run;
-
-       /* ID = 0xff (broadcast), ID > 0xff (reserved) */
-       if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
-               return;
-
-       entry = READ_ONCE(*(svm->avic_physical_id_cache));
-       WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+       if (!kvm_arch_has_assigned_device(vcpu->kvm))
+               return 0;
 
-       entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-       if (is_run)
-               entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-       WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+       /*
+        * Here, we go through the per-vcpu ir_list to update all existing
+        * interrupt remapping table entry targeting this vcpu.
+        */
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+       list_for_each_entry(ir, &svm->ir_list, node) {
+               ret = amd_iommu_update_ga(cpu, (pa & AVIC_HPA_MASK), r, ir);
+               if (ret)
+                       break;
+       }
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+       return ret;
 }
 
 static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1494,6 +1503,9 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
 
        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+       avic_update_iommu(vcpu, h_physical_id,
+                         page_to_phys(svm->avic_backing_page),
+                         svm->avic_is_running);
 }
 
 static void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1505,10 +1517,28 @@ static void avic_vcpu_put(struct kvm_vcpu *vcpu)
                return;
 
        entry = READ_ONCE(*(svm->avic_physical_id_cache));
+       if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+               avic_update_iommu(vcpu, -1,
+                                 page_to_phys(svm->avic_backing_page), 0);
+
        entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
 }
 
+/**
+ * This function is called during VCPU halt/unhalt.
+ */
+static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       svm->avic_is_running = is_run;
+       if (is_run)
+               avic_vcpu_load(vcpu, vcpu->cpu);
+       else
+               avic_vcpu_put(vcpu);
+}
+
 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1570,6 +1600,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, 
unsigned int id)
                err = avic_init_backing_page(&svm->vcpu);
                if (err)
                        goto free_page4;
+
+               INIT_LIST_HEAD(&svm->ir_list);
+               spin_lock_init(&svm->ir_list_lock);
        }
 
        /* We initialize this flag to true to make sure that the is_running
@@ -4366,6 +4399,177 @@ static void svm_deliver_avic_intr(struct kvm_vcpu 
*vcpu, int vec)
                kvm_vcpu_wake_up(vcpu);
 }
 
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_ir_data *ir)
+{
+       bool found = false;
+       unsigned long flags;
+       struct amd_ir_data *cur;
+
+       /**
+        * In some cases, the existing irte is updaed and re-set,
+        * so we need to check here if it's already been * added
+        * to the ir_list.
+        */
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+       list_for_each_entry(cur, &svm->ir_list, node) {
+               if (cur != ir)
+                       continue;
+               found = true;
+               break;
+       }
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+
+       if (found)
+               return 0;
+
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+       list_add(&ir->node, &svm->ir_list);
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+       return 0;
+}
+
+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_ir_data *ir)
+{
+       unsigned long flags;
+       struct amd_ir_data *cur;
+
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+       list_for_each_entry(cur, &svm->ir_list, node) {
+               if (cur != ir)
+                       continue;
+               list_del(&cur->node);
+               break;
+       }
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+/*
+ * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+                             uint32_t guest_irq, bool set)
+{
+       struct kvm_kernel_irq_routing_entry *e;
+       struct kvm_irq_routing_table *irq_rt;
+       int idx, ret = -EINVAL;
+
+       if (!kvm_arch_has_assigned_device(kvm) ||
+           !irq_remapping_cap(IRQ_POSTING_CAP))
+               return 0;
+
+       pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+                __func__, host_irq, guest_irq, set);
+
+       idx = srcu_read_lock(&kvm->irq_srcu);
+       irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+       WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+       hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+               struct kvm_lapic_irq irq;
+               struct vcpu_data vcpu_info;
+               struct kvm_vcpu *vcpu = NULL;
+               struct vcpu_svm *svm = NULL;
+
+               if (e->type != KVM_IRQ_ROUTING_MSI)
+                       continue;
+
+               /**
+                * Note:
+                * The HW cannot support posting multicast/broadcast
+                * interrupts to a vCPU. So, we still use interrupt
+                * remapping for these kind of interrupts.
+                *
+                * For lowest-priority interrupts, we only support
+                * those with single CPU as the destination, e.g. user
+                * configures the interrupts via /proc/irq or uses
+                * irqbalance to make the interrupts single-CPU.
+                */
+               kvm_set_msi_irq(e, &irq);
+               if (kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+                       svm = to_svm(vcpu);
+                       vcpu_info.pi_desc_addr = 
page_to_phys(svm->avic_backing_page);
+                       vcpu_info.vector = irq.vector;
+
+                       pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+                                irq.vector);
+               } else {
+                       set = false;
+
+                       pr_debug("SVM: %s: use legacy intr remap mode for irq 
%u\n",
+                                __func__, irq.vector);
+               }
+
+               trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
+                                        vcpu_info.vector,
+                                        vcpu_info.pi_desc_addr, set);
+
+               /**
+                * When AVIC is disabled, we fall-back to setup
+                * IRTE w/ legacy mode
+                */
+               if (set && kvm_vcpu_apicv_active(&svm->vcpu)) {
+                       struct amd_iommu_pi_data pi;
+
+                       /* Try to enable guest_mode in IRTE */
+                       pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
+                                                    vcpu->vcpu_id);
+                       pi.is_guest_mode = true;
+                       pi.vcpu_data = &vcpu_info;
+                       ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+                       /**
+                        * We save the pointer to pi in the struct
+                        * vcpu_svm so that we can reference to them directly
+                        * when we update vcpu scheduling information in IOMMU
+                        * irte.
+                        */
+                       if (!ret && pi.is_guest_mode)
+                               svm_ir_list_add(svm, pi.ir_data);
+               } else {
+                       /* Use legacy mode in IRTE */
+                       struct amd_iommu_pi_data pi;
+
+                       /**
+                        * Here, pi is used to:
+                        * - Tell IOMMU to use legacy mode for this interrupt.
+                        * - Retrieve ga_tag of prior interrupt remapping data.
+                        */
+                       pi.is_guest_mode = false;
+                       ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+                       /**
+                        * We need to check if the interrupt was previously
+                        * setup with the guest_mode by checking if the ga_tag
+                        * was cached. If so, we need to clean up the per-vcpu
+                        * ir_list.
+                        */
+                       if (!ret && pi.ga_tag) {
+                               struct kvm_vcpu *vcpu = kvm_get_vcpu_by_id(kvm,
+                                               
AVIC_GATAG_TO_VCPUID(pi.ga_tag));
+
+                               if (vcpu)
+                                       svm_ir_list_del(to_svm(vcpu), 
pi.ir_data);
+                       }
+               }
+
+               if (ret < 0) {
+                       pr_err("%s: failed to update PI IRTE\n", __func__);
+                       goto out;
+               }
+       }
+
+       ret = 0;
+out:
+       srcu_read_unlock(&kvm->irq_srcu, idx);
+       return ret;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -5192,6 +5396,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 
        .pmu_ops = &amd_pmu_ops,
        .deliver_posted_interrupt = svm_deliver_avic_intr,
+       .update_pi_irte = svm_update_pi_irte,
 };
 
 static int __init svm_init(void)
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 7b2e802..cd10393 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -34,6 +34,7 @@ struct amd_ir_data {
        struct msi_msg msi_entry;
        void *entry;    /* Pointer to union irte or struct irte_ga */
        void *ref;      /* Pointer to the actual irte */
+       struct list_head node;  /* Used by SVM for per-vcpu ir_list */
 };
 
 /*
-- 
1.9.1

Reply via email to