hw/xen: Support HVM_PARAM_CALLBACK_TYPE_PCI_INTX callback

The guest is permitted to specify an arbitrary domain/bus/device/function and INTX pin from which the callback IRQ shall appear to have come. In QEMU we can only easily do this for devices that actually exist, and even that requires us "knowing" that it's a PCMachine in order to find the PCI root bus — although that's OK really because it's always true. We also don't get to get notified of INTX routing changes, because we can't do that as a passive observer; if we try to register a notifier it will overwrite any existing notifier callback on the device. But in practice, guests using PCI_INTX will only ever use pin A on the Xen platform device, and won't swizzle the INTX routing after they set it up. So this is just fine. Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> Reviewed-by: Paul Durrant <paul@xen.org>
2022-12-16 00:03:21 +00:00 · 2022-12-16 00:03:21 +00:00 · 2aff696b10
commit 2aff696b10
parent ddf0fd9ae1
2 changed files with 100 additions and 14 deletions
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@ -28,6 +28,8 @@
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
 #include "hw/i386/x86.h"
+#include "hw/i386/pc.h"
+#include "hw/pci/pci.h"
 #include "hw/irq.h"

 #include "xen_evtchn.h"
@ -101,6 +103,7 @@ struct XenEvtchnState {

    uint64_t callback_param;
    bool evtchn_in_kernel;
+    uint32_t callback_gsi;

    QEMUBH *gsi_bh;

@ -217,11 +220,41 @@ static void xen_evtchn_register_types(void)

 type_init(xen_evtchn_register_types)

+static int set_callback_pci_intx(XenEvtchnState *s, uint64_t param)
+{
+    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
+    uint8_t pin = param & 3;
+    uint8_t devfn = (param >> 8) & 0xff;
+    uint16_t bus = (param >> 16) & 0xffff;
+    uint16_t domain = (param >> 32) & 0xffff;
+    PCIDevice *pdev;
+    PCIINTxRoute r;
+
+    if (domain || !pcms) {
+        return 0;
+    }
+
+    pdev = pci_find_device(pcms->bus, bus, devfn);
+    if (!pdev) {
+        return 0;
+    }
+
+    r = pci_device_route_intx_to_irq(pdev, pin);
+    if (r.mode != PCI_INTX_ENABLED) {
+        return 0;
+    }
+
+    /*
+     * Hm, can we be notified of INTX routing changes? Not without
+     * *owning* the device and being allowed to overwrite its own
+     * ->intx_routing_notifier, AFAICT. So let's not.
+     */
+    return r.irq;
+}
+
 void xen_evtchn_set_callback_level(int level)
 {
    XenEvtchnState *s = xen_evtchn_singleton;
-    uint32_t param;
-
    if (!s) {
        return;
    }
@ -260,18 +293,12 @@ void xen_evtchn_set_callback_level(int level)
        return;
    }

-    param = (uint32_t)s->callback_param;
-
-    switch (s->callback_param >> CALLBACK_VIA_TYPE_SHIFT) {
-    case HVM_PARAM_CALLBACK_TYPE_GSI:
-        if (param < IOAPIC_NUM_PINS) {
-            qemu_set_irq(s->gsis[param], level);
-            if (level) {
-                /* Ensure the vCPU polls for deassertion */
-                kvm_xen_set_callback_asserted();
-            }
+    if (s->callback_gsi && s->callback_gsi < IOAPIC_NUM_PINS) {
+        qemu_set_irq(s->gsis[s->callback_gsi], level);
+        if (level) {
+            /* Ensure the vCPU polls for deassertion */
+            kvm_xen_set_callback_asserted();
        }
-        break;
    }
 }

@ -283,15 +310,22 @@ int xen_evtchn_set_callback_param(uint64_t param)
        .u.vector = 0,
    };
    bool in_kernel = false;
+    uint32_t gsi = 0;
+    int type = param >> CALLBACK_VIA_TYPE_SHIFT;
    int ret;

    if (!s) {
        return -ENOTSUP;
    }

+    /*
+     * We need the BQL because set_callback_pci_intx() may call into PCI code,
+     * and because we may need to manipulate the old and new GSI levels.
+     */
+    assert(qemu_mutex_iothread_locked());
    qemu_mutex_lock(&s->port_lock);

-    switch (param >> CALLBACK_VIA_TYPE_SHIFT) {
+    switch (type) {
    case HVM_PARAM_CALLBACK_TYPE_VECTOR: {
        xa.u.vector = (uint8_t)param,

@ -299,10 +333,17 @@ int xen_evtchn_set_callback_param(uint64_t param)
        if (!ret && kvm_xen_has_cap(EVTCHN_SEND)) {
            in_kernel = true;
        }
+        gsi = 0;
        break;
    }

+    case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
+        gsi = set_callback_pci_intx(s, param);
+        ret = gsi ? 0 : -EINVAL;
+        break;
+
    case HVM_PARAM_CALLBACK_TYPE_GSI:
+        gsi = (uint32_t)param;
        ret = 0;
        break;

@ -320,6 +361,17 @@ int xen_evtchn_set_callback_param(uint64_t param)
        }
        s->callback_param = param;
        s->evtchn_in_kernel = in_kernel;
+
+        if (gsi != s->callback_gsi) {
+            struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0);
+
+            xen_evtchn_set_callback_level(0);
+            s->callback_gsi = gsi;
+
+            if (gsi && vi && vi->evtchn_upcall_pending) {
+                kvm_xen_inject_vcpu_callback_vector(0, type);
+            }
+        }
    }

    qemu_mutex_unlock(&s->port_lock);
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@ -131,6 +131,38 @@ int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
        return ret;
    }

+    /* If called a second time, don't repeat the rest of the setup. */
+    if (s->xen_caps) {
+        return 0;
+    }
+
+    /*
+     * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
+     * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
+     *
+     * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
+     * such things to be polled at precisely the right time. We *could* do
+     * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
+     * the moment the IRQ is acked, and see if it should be reasserted.
+     *
+     * But the in-kernel irqchip is deprecated, so we're unlikely to add
+     * that support in the kernel. Insist on using the split irqchip mode
+     * instead.
+     *
+     * This leaves us polling for the level going low in QEMU, which lacks
+     * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
+     * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
+     * the device (for which it has to unmap the device and trap access, for
+     * some period after an IRQ!!). In the Xen case, we do it on exit from
+     * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
+     * Which is kind of icky, but less so than the VFIO one. I may fix them
+     * both later...
+     */
+    if (!kvm_kernel_irqchip_split()) {
+        error_report("kvm: Xen support requires kernel-irqchip=split");
+        return -EINVAL;
+    }
+
    s->xen_caps = xen_caps;
    return 0;
 }
@ -684,7 +716,9 @@ static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,

    switch (hp.index) {
    case HVM_PARAM_CALLBACK_IRQ:
+        qemu_mutex_lock_iothread();
        err = xen_evtchn_set_callback_param(hp.value);
+        qemu_mutex_unlock_iothread();
        xen_set_long_mode(exit->u.hcall.longmode);
        break;
    default: