From f6185f98bfc55bddce67aafa7b7d7f469de3ede5 Mon Sep 17 00:00:00 2001 From: Larry Ruckman Date: Tue, 26 May 2026 09:10:56 -0700 Subject: [PATCH 1/7] datadev/emulator: MSI-X/MSI/INTx IRQ cascade + virtual MSI domain datadev: replace the unconditional INTx assignment in DataDev_Probe with a pci_alloc_irq_vectors(... PCI_IRQ_MSIX | PCI_IRQ_MSI | PCI_IRQ_INTX) cascade. The kernel negotiates the best available type; probe fails only if all three are unavailable. The negotiated type is recorded in dev->irqType and logged. pci_free_irq_vectors() is released on the probe error path and in DataDev_Remove (after free_irq, before pci_disable_device). pci_irq_vector()'s signed errno is validated before being stored into the uint32_t dev->irq. emulator: add a software-only virtual PCI-MSI / MSI-X irq domain (virt_msi.{c,h}) and an emu_irq_mode modparam (intx | msi | msix) so the virtual host bridge can advertise any of the three IRQ capabilities, letting CI exercise every datadev cascade branch. MSI hwirqs are recycled, irq_work is synced on teardown, the MSI-X table/PBA live in BAR0, and the Status Capabilities-List bit is gated on irq_mode. CI: tests/test_irq_modes.sh sweeps emu_irq_mode {intx, msi, msix} per CPU cell, verifying the cfgIrqHold/polled dataplane and asserting datadev's probe-time cascade selected the matching branch. The GPU phase runs the legacy single-mode pass. cross-kernel: a #ifndef PCI_IRQ_INTX shim provides the name renamed from PCI_IRQ_LEGACY in 6.11 so the cascade builds on kernel 5.15; the emulator guards pci_msi_create_irq_domain for 6.19+. docs: update the test_irq_modes.sh descriptions in test-applications.rst and ci-pipeline.rst for the new sweep, and add the PCI_IRQ_INTX (6.11) guard to the kernel-compatibility matrix. --- common/driver/data_dev_top.c | 51 +++- common/driver/dma_common.h | 11 + docs/explanation/ci-pipeline.rst | 8 +- docs/explanation/kernel-compatibility.rst | 7 + docs/reference/test-applications.rst | 9 +- emulator/driver/Makefile | 2 +- emulator/driver/src/dma_engine.c | 35 ++- emulator/driver/src/dma_engine.h | 5 +- emulator/driver/src/emu_main.c | 121 ++++++++- emulator/driver/src/virt_msi.c | 315 ++++++++++++++++++++++ emulator/driver/src/virt_msi.h | 119 ++++++++ emulator/driver/src/virt_pci_host.c | 94 ++++++- emulator/driver/src/virt_pci_host.h | 62 ++++- tests/test_irq_modes.sh | 119 +++++++- 14 files changed, 903 insertions(+), 55 deletions(-) create mode 100644 emulator/driver/src/virt_msi.c create mode 100644 emulator/driver/src/virt_msi.h diff --git a/common/driver/data_dev_top.c b/common/driver/data_dev_top.c index bcc82a9d..ac2c0196 100755 --- a/common/driver/data_dev_top.c +++ b/common/driver/data_dev_top.c @@ -39,6 +39,14 @@ #include #endif +// PCI_IRQ_LEGACY was renamed to PCI_IRQ_INTX in Linux 6.11; the old name was +// kept as a deprecated alias for a few releases, then dropped. Provide the new +// spelling on pre-6.11 kernels so the single pci_alloc_irq_vectors() call site +// builds across the CI matrix (Ubuntu 22.04 / kernel 5.15 lacks PCI_IRQ_INTX). +#ifndef PCI_IRQ_INTX +#define PCI_IRQ_INTX PCI_IRQ_LEGACY +#endif + // Init Configuration values static int cfgTxCount = 1024; static int cfgRxCount = 1024; @@ -264,15 +272,39 @@ int DataDev_Probe(struct pci_dev *pcidev, const struct pci_device_id *dev_id) { dev->debug = cfgDebug; - // Assign the IRQ number from the pci_dev structure - dev->irq = pcidev->irq; + // Allocate IRQ vectors: try MSI-X first, then MSI, then legacy INTx. + // pci_alloc_irq_vectors() walks the requested types in priority order + // and returns the count of vectors negotiated; pci_irq_vector(pdev, 0) + // returns the right Linux IRQ number regardless of which path won. + ret = pci_alloc_irq_vectors(pcidev, 1, 1, + PCI_IRQ_MSIX | PCI_IRQ_MSI | PCI_IRQ_INTX); + if (ret < 0) { + dev_err(&pcidev->dev, + "%s: Probe: pci_alloc_irq_vectors() = %i\n", MOD_NAME, ret); + probeReturn = ret; + goto err_unmap; + } - // Check that we actually have an IRQ - if (dev->irq == 0) { - dev_err(dev->device, "%s: No IRQ associated with PCI device\n", MOD_NAME); - probeReturn = -EINVAL; + // pci_irq_vector() returns a signed errno on failure; capture it in an + // int and reject < 0 before storing into the uint32_t dev->irq, otherwise + // a negative value would wrap to a bogus IRQ and reach request_irq(). + ret = pci_irq_vector(pcidev, 0); + if (ret < 0) { + dev_err(&pcidev->dev, + "%s: Probe: pci_irq_vector() = %i\n", MOD_NAME, ret); + probeReturn = ret; goto err_unmap; } + dev->irq = ret; + if (pcidev->msix_enabled) dev->irqType = DMA_IRQ_MSIX; + else if (pcidev->msi_enabled) dev->irqType = DMA_IRQ_MSI; + else dev->irqType = DMA_IRQ_INTX; + + dev_info(dev->device, + "Init: Probe: using %s interrupts, irq=%u\n", + dev->irqType == DMA_IRQ_MSIX ? "MSI-X" : + dev->irqType == DMA_IRQ_MSI ? "MSI" : "legacy INTx", + dev->irq); // Set basic device context @@ -347,6 +379,8 @@ int DataDev_Probe(struct pci_dev *pcidev, const struct pci_device_id *dev_id) { dev->utilData = NULL; #endif Dma_UnmapReg(dev); // Idempotent: safe even if Dma_MapReg never ran + pci_free_irq_vectors(pcidev); // Releases MSI/MSI-X/INTx allocation; no-op if + // pci_alloc_irq_vectors failed or never ran err_post_en: pci_disable_device(pcidev); // Disable PCI device on failure err_pre_en: @@ -407,9 +441,12 @@ void DataDev_Remove(struct pci_dev *pcidev) { } #endif - // Call common DMA clean function + // Call common DMA clean function (calls free_irq() internally) Dma_Clean(dev); + // Release MSI/MSI-X/INTx vectors (must follow free_irq, precede pci_disable_device) + pci_free_irq_vectors(pcidev); + // Disable device pci_disable_device(pcidev); diff --git a/common/driver/dma_common.h b/common/driver/dma_common.h index b9080f88..830d9946 100755 --- a/common/driver/dma_common.h +++ b/common/driver/dma_common.h @@ -44,6 +44,15 @@ // Maximum number of destination channels #define DMA_MAX_DEST (8*DMA_MASK_SIZE) +// IRQ delivery type selected at probe time. Set by the PCI top-level +// driver (data_dev_top.c) based on what pci_alloc_irq_vectors() actually +// negotiated; consumed for diagnostic logging only -- the shared +// Dma_Init/Dma_Clean paths in dma_common.c do not branch on this. +#define DMA_IRQ_NONE 0 +#define DMA_IRQ_INTX 1 +#define DMA_IRQ_MSI 2 +#define DMA_IRQ_MSIX 3 + // Forward declarations struct hardware_functions; struct DmaDesc; @@ -82,6 +91,7 @@ typedef uint32_t __poll_t; * @utilData: Utility data for driver use. * @debug: Debug flag. * @irq: IRQ number. + * @irqType: IRQ delivery type (DMA_IRQ_NONE/INTX/MSI/MSIX). * @writeHwLock: Spinlock for hardware write operations. * @commandLock: Spinlock for command operations. * @maskLock: Spinlock for destination mask operations. @@ -145,6 +155,7 @@ struct DmaDevice { // IRQ uint32_t irq; + uint8_t irqType; // Locks spinlock_t writeHwLock; diff --git a/docs/explanation/ci-pipeline.rst b/docs/explanation/ci-pipeline.rst index a10f9e96..4fb66c4c 100644 --- a/docs/explanation/ci-pipeline.rst +++ b/docs/explanation/ci-pipeline.rst @@ -378,8 +378,12 @@ The combined Phase 2 + Phase 3 coverage across all distributions: - 3 - Multi-destination (0, 7, 8), cross-contamination check * - IRQ modes - - 3 - - cfgIrqHold=1, cfgIrqHold=100000, polled (cfgIrqDis=1) + - 12 + - ``emu_irq_mode`` {intx, msi, msix} sweep (CPU phase). Each mode + runs cfgIrqHold=1, cfgIrqHold=100000, and polled (cfgIrqDis=1), + plus a probe-cascade assertion that ``datadev`` selected the + matching INTx / MSI / MSI-X path. GPU phase runs the single-mode + legacy pass * - Module lifecycle - 4 - 3 rapid reload cycles, rmmod-under-load, cfgMode=1/2 transitions diff --git a/docs/explanation/kernel-compatibility.rst b/docs/explanation/kernel-compatibility.rst index fe59902a..e6548334 100644 --- a/docs/explanation/kernel-compatibility.rst +++ b/docs/explanation/kernel-compatibility.rst @@ -65,6 +65,13 @@ the source files listed. ``class_create()`` dropped its first argument (``THIS_MODULE``) in 6.4. Red Hat backported this change in RHEL 9.4. +``< KERNEL_VERSION(6, 11, 0)`` — ``data_dev_top.c`` + ``PCI_IRQ_LEGACY`` was renamed to ``PCI_IRQ_INTX`` in 6.11 (the old name was kept as a + deprecated alias for a few releases, then dropped). A ``#ifndef PCI_IRQ_INTX`` guard + defines the new spelling as ``PCI_IRQ_LEGACY`` so the single ``pci_alloc_irq_vectors()`` + call site builds on pre-6.11 kernels (e.g. Ubuntu 22.04 / kernel 5.15, which lacks + ``PCI_IRQ_INTX``). + RHEL Backport Detection ------------------------ diff --git a/docs/reference/test-applications.rst b/docs/reference/test-applications.rst index 984e1f8a..47e5fcf4 100644 --- a/docs/reference/test-applications.rst +++ b/docs/reference/test-applications.rst @@ -267,8 +267,13 @@ by ``scripts/ci/test-cpu.sh`` or ``scripts/ci/test-gpu.sh`` during CI. * - ``test_backpressure.sh`` - Saturates TX buffers then verifies recovery and correct dmesg state * - ``test_irq_modes.sh`` - - Sweeps ``cfgIrqHold`` (1, 100000) and polled mode; verifies - loopback works under each IRQ configuration + - Sweeps the emulator's ``emu_irq_mode`` (``intx`` / ``msi`` / + ``msix``). For each mode it reloads ``datadev`` across + ``cfgIrqHold`` (1, 100000) and polled mode, verifies loopback + + PRBS integrity, then asserts that ``datadev``'s probe-time + ``pci_alloc_irq_vectors`` cascade selected the matching interrupt + type. The ``emu_irq_mode`` sweep is CPU-phase only; the GPU phase + and emulator-less hosts run the legacy single-mode pass * - ``test_concurrent_open.sh`` - Two ``dmaLoopTest`` instances on different destinations simultaneously * - ``test_idx_loopback.sh`` diff --git a/emulator/driver/Makefile b/emulator/driver/Makefile index 2c9fb6da..35ee4951 100644 --- a/emulator/driver/Makefile +++ b/emulator/driver/Makefile @@ -52,7 +52,7 @@ else endif # Object files that make up the module -$(NAME)-objs := src/bar0_regs.o src/virt_irq.o src/virt_pci_host.o src/dma_engine.o src/prbs.o src/gpu_engine.o src/emu_main.o +$(NAME)-objs := src/bar0_regs.o src/virt_irq.o src/virt_msi.o src/virt_pci_host.o src/dma_engine.o src/prbs.o src/gpu_engine.o src/emu_main.o # Target module obj-m := $(NAME).o diff --git a/emulator/driver/src/dma_engine.c b/emulator/driver/src/dma_engine.c index 707cb0a6..8ecaae48 100644 --- a/emulator/driver/src/dma_engine.c +++ b/emulator/driver/src/dma_engine.c @@ -368,6 +368,31 @@ static int emu_process_tx(struct emu_dma_engine *eng) return tx_processed; } +/* ---------------------------------------------------------------- + * IRQ delivery dispatch + * ---------------------------------------------------------------- */ + +/* + * emu_dma_fire_irq - Trigger the IRQ that datadev currently has registered. + * + * In INTx mode datadev called request_irq() on emu_irq.virq, so we fire that. + * In MSI / MSI-X mode, the kernel allocated a child virq through our PCI-MSI + * domain when datadev called pci_alloc_irq_vectors(); the parent .alloc op + * stashed it in emu_msi.alloc_virq. Fire that one instead -- emu_irq.virq is + * stale (datadev never request_irq'd it). + * + * The MSI path may briefly have alloc_virq == 0 between emu_dma_start (which + * runs before the host bridge is created) and the kernel's MSI alloc during + * datadev probe; emu_msi_fire_safe() short-circuits in that window. + */ +static inline void emu_dma_fire_irq(struct emu_dma_engine *eng) +{ + if (eng->msi != NULL && eng->msi->alloc_virq != 0) + emu_msi_fire_safe(eng->msi); + else + emu_irq_fire_safe(eng->irq); +} + /* ---------------------------------------------------------------- * Polling kthread * ---------------------------------------------------------------- */ @@ -530,7 +555,7 @@ static int emu_poll_thread_fn(void *data) need_irq = true; if (need_irq) { - emu_irq_fire_safe(eng->irq); + emu_dma_fire_irq(eng); eng->irq_count++; } } @@ -592,7 +617,7 @@ static int emu_poll_thread_fn(void *data) eng->wr_ring_idx = (eng->wr_ring_idx + 1) % eng->addr_count; emu_reg_write(eng, EMU_REG_HWWRINDEX, eng->wr_ring_idx); - emu_irq_fire_safe(eng->irq); + emu_dma_fire_irq(eng); eng->irq_count++; eng->seed_pending = true; eng->seed_idx++; @@ -613,11 +638,17 @@ static int emu_poll_thread_fn(void *data) * Engine lifecycle * ---------------------------------------------------------------- */ +void emu_dma_set_msi(struct emu_dma_engine *eng, struct emu_msi *msi) +{ + eng->msi = msi; +} + int emu_dma_init(struct emu_dma_engine *eng, struct emu_bar0 *bar, struct emu_irq *irq) { eng->bar = bar; eng->irq = irq; + eng->msi = NULL; eng->reg_base = bar->uc_virt + EMU_AGEN2_OFF; eng->rd_ring = NULL; diff --git a/emulator/driver/src/dma_engine.h b/emulator/driver/src/dma_engine.h index a2111271..6cc0f0e4 100644 --- a/emulator/driver/src/dma_engine.h +++ b/emulator/driver/src/dma_engine.h @@ -32,6 +32,7 @@ #include #include "bar0_regs.h" #include "virt_irq.h" +#include "virt_msi.h" /* Maximum RX buffers in free pool (matches driver's default rxBuffers.count) */ #define EMU_FREE_POOL_MAX 4096 @@ -81,7 +82,8 @@ struct emu_free_buf { struct emu_dma_engine { /* Back-references */ struct emu_bar0 *bar; - struct emu_irq *irq; + struct emu_irq *irq; /* legacy INTx delivery (always populated) */ + struct emu_msi *msi; /* MSI/MSI-X delivery (NULL in INTx mode) */ /* Register base (bar->virt + EMU_AGEN2_OFF) */ void *reg_base; @@ -138,6 +140,7 @@ struct emu_dma_engine { int emu_dma_init(struct emu_dma_engine *eng, struct emu_bar0 *bar, struct emu_irq *irq); +void emu_dma_set_msi(struct emu_dma_engine *eng, struct emu_msi *msi); void emu_dma_start(struct emu_dma_engine *eng); void emu_dma_stop(struct emu_dma_engine *eng); void emu_dma_destroy(struct emu_dma_engine *eng); diff --git a/emulator/driver/src/emu_main.c b/emulator/driver/src/emu_main.c index 149e3ae5..3e9bc169 100644 --- a/emulator/driver/src/emu_main.c +++ b/emulator/driver/src/emu_main.c @@ -28,6 +28,7 @@ #include "bar0_regs.h" #include "virt_pci_host.h" #include "virt_irq.h" +#include "virt_msi.h" #include "dma_engine.h" #include "gpu_engine.h" /* Drain-callback registration surface from nvidia_p2p_stub. */ @@ -57,6 +58,26 @@ module_param(emu_prbs_seed, uint, 0444); MODULE_PARM_DESC(emu_prbs_seed, "PRBS seed (0 = random via get_random_u32)"); +/* + * IRQ delivery mode advertised by the virtual PCI device. Selects which + * capability block emu_pci_init_cfg_space installs in cfg_space and + * whether dev_set_msi_domain() is invoked on the enumerated pdev: + * + * "intx" (default): no caps; pcidev->irq = virq for legacy INTx delivery. + * "msi" : 32-bit MSI cap at 0x40, MSI domain attached so the kernel + * succeeds pci_alloc_irq_vectors(... PCI_IRQ_MSI). + * "msix" : MSI-X cap at 0x50, table at BAR0+0x38000 and PBA at + * BAR0+0x39000 (EMU_MSIX_TABLE_OFF / EMU_MSIX_PBA_OFF), MSI + * domain attached so pci_alloc_irq_vectors(... PCI_IRQ_MSIX) + * succeeds. + * + * tests/test_irq_modes.sh sweeps {intx, msi, msix} on every CPU-CI cell. + */ +static char *emu_irq_mode = "intx"; +module_param(emu_irq_mode, charp, 0444); +MODULE_PARM_DESC(emu_irq_mode, + "IRQ delivery mode: intx | msi | msix (default intx)"); + /* Gates a pr_info in emu_gpu_rx_tick that emits "emu_gpu: buf=%u * rx_seq=0x%08x" per RX-completed frame. Default 0 (off) so soak dmesg * is clean. Set to 1 during bring-up to observe per-buffer PRBS @@ -95,10 +116,44 @@ MODULE_PARM_DESC(emu_gpu_poll_interval_us, /* File-static global state */ static struct emu_bar0 emu_bar; static struct emu_irq emu_irq; +static struct emu_msi emu_msi; static struct emu_dma_engine emu_dma; static struct emu_pci_host emu_host; static struct emu_gpu_engine emu_gpu; +/* + * Resolved irq_mode (parsed from the emu_irq_mode modparam at init). + * Tracked separately so emu_exit() can branch teardown without re-parsing + * the string. + */ +static enum emu_irq_mode emu_irq_mode_val = EMU_IRQ_MODE_INTX; + +/** + * emu_parse_irq_mode - Map the emu_irq_mode modparam string to its enum. + * @s: modparam string (must be non-NULL) + * @out: resolved enum on success + * + * Return: 0 on success, -EINVAL on unrecognized value. + */ +static int emu_parse_irq_mode(const char *s, enum emu_irq_mode *out) +{ + if (s == NULL) + return -EINVAL; + if (!strcmp(s, "intx")) { + *out = EMU_IRQ_MODE_INTX; + return 0; + } + if (!strcmp(s, "msi")) { + *out = EMU_IRQ_MODE_MSI; + return 0; + } + if (!strcmp(s, "msix")) { + *out = EMU_IRQ_MODE_MSIX; + return 0; + } + return -EINVAL; +} + /* ---------------------------------------------------------------- * Drain callback. * @@ -135,6 +190,19 @@ static int __init emu_init(void) pr_info("%s: loading emulator module\n", EMU_MOD_NAME); + /* Validate emu_irq_mode BEFORE any allocation so a typo on the insmod + * line fails fast (-EINVAL) instead of silently coercing to a default. + * tests/test_irq_modes.sh relies on this -- a quietly-defaulted mode + * would let the test see "INTx" when it asked for "msix" and produce + * a misleading "everything passed" report. */ + ret = emu_parse_irq_mode(emu_irq_mode, &emu_irq_mode_val); + if (ret) { + pr_err("%s: invalid emu_irq_mode='%s' (expected intx|msi|msix)\n", + EMU_MOD_NAME, emu_irq_mode ? emu_irq_mode : "(null)"); + return ret; + } + pr_info("%s: irq mode = %s\n", EMU_MOD_NAME, emu_irq_mode); + /* Reject out-of-range emu_gpu_max_buffers BEFORE any allocation so * we never leak resources on a misconfigured insmod. The V4 BAR0 * MaxBuffersV4 register field is [10:0] -> upper bound 1024. Zero @@ -176,19 +244,36 @@ static int __init emu_init(void) * both track the insmod parameter. */ emu_bar0_init_regs(&emu_bar, emu_gpu_max_buffers); - /* Step 4: Create IRQ domain and virtual IRQ */ + /* Step 4: Create IRQ domain and virtual IRQ (always; legacy INTx slot + * stays populated even in MSI/MSI-X mode so cfg_space EMU_CFG_IRQ_LINE + * has a stable value and the cascade fall-through path remains valid). */ ret = emu_irq_create(&emu_irq); if (ret) { pr_err("%s: IRQ creation failed (%d)\n", EMU_MOD_NAME, ret); goto err_release_iomem; } - /* Step 5: Initialize DMA engine */ + /* Step 4b: Create the virtual PCI-MSI domain when needed. INTx mode + * skips this -- no domain is attached to the bus, no MSI cap is + * advertised, and the kernel never tries to allocate MSI vectors. */ + if (emu_irq_mode_val != EMU_IRQ_MODE_INTX) { + ret = emu_msi_create(&emu_msi); + if (ret) { + pr_err("%s: MSI domain creation failed (%d)\n", EMU_MOD_NAME, ret); + goto err_destroy_irq; + } + } + + /* Step 5: Initialize DMA engine. Wire the MSI sink so the engine fires + * the kernel-allocated child virq once datadev's pci_alloc_irq_vectors + * runs; INTx mode leaves it NULL and the engine falls back to emu_irq. */ ret = emu_dma_init(&emu_dma, &emu_bar, &emu_irq); if (ret) { pr_err("%s: DMA engine init failed (%d)\n", EMU_MOD_NAME, ret); - goto err_destroy_irq; + goto err_destroy_msi; } + if (emu_irq_mode_val != EMU_IRQ_MODE_INTX) + emu_dma_set_msi(&emu_dma, &emu_msi); /* Step 6: Start DMA engine polling BEFORE creating the PCI host bridge. * The poll thread must be running when pci_bus_add_devices() triggers @@ -237,9 +322,15 @@ static int __init emu_init(void) /* Step 7: Create PCI host bridge -- triggers bus scan and driver probe. * emu_pci_host_create() sets pcidev->irq = virq before - * pci_bus_add_devices() so the datadev driver's IRQ check at probe - * time succeeds. */ - ret = emu_pci_host_create(&emu_host, &emu_bar, emu_irq.virq); + * pci_bus_add_devices() so the datadev driver's INTx fallback path + * succeeds; for MSI/MSI-X mode it also attaches the MSI domain to the + * enumerated pdev so pci_alloc_irq_vectors() finds something to + * allocate from. */ + ret = emu_pci_host_create(&emu_host, &emu_bar, emu_irq.virq, + emu_irq_mode_val, + emu_irq_mode_val == EMU_IRQ_MODE_INTX + ? NULL + : emu_msi_get_domain(&emu_msi)); if (ret) { pr_err("%s: PCI host creation failed (%d)\n", EMU_MOD_NAME, ret); goto err_unregister_drain; @@ -278,6 +369,9 @@ static int __init emu_init(void) * target lands here because emu_dma_init failure skips the * corresponding stop step). */ emu_dma_destroy(&emu_dma); +err_destroy_msi: + if (emu_irq_mode_val != EMU_IRQ_MODE_INTX) + emu_msi_destroy(&emu_msi); err_destroy_irq: emu_irq_destroy(&emu_irq); err_release_iomem: @@ -339,12 +433,19 @@ static void __exit emu_exit(void) * any residual engine state (idempotent if stop already ran). */ emu_gpu_destroy(&emu_gpu); - /* Step 3: Destroy DMA engine -- flush and destroy workqueue after - * PCI host is gone but before IRQ domain teardown, since - * irq_work may still be pending until the workqueue is flushed */ + /* Step 3: Destroy DMA engine after the PCI host is gone. The poll + * thread (the only emu_msi_fire_safe() caller) was already stopped in + * Step 2, so no new MSI irq_work can be queued past this point. */ emu_dma_destroy(&emu_dma); - /* Step 4: Destroy IRQ domain */ + /* Step 4: Destroy MSI domain (no-op in INTx mode). emu_msi_destroy() + * runs irq_work_sync() internally to drain any fire queued just before + * the poll thread stopped, then removes the child domain before the + * parent so the kernel doesn't walk a child whose parent has gone. */ + if (emu_irq_mode_val != EMU_IRQ_MODE_INTX) + emu_msi_destroy(&emu_msi); + + /* Step 4b: Destroy parent IRQ domain */ emu_irq_destroy(&emu_irq); /* Step 5: Release BAR0 iomem window and restore the original System diff --git a/emulator/driver/src/virt_msi.c b/emulator/driver/src/virt_msi.c new file mode 100644 index 00000000..52a24436 --- /dev/null +++ b/emulator/driver/src/virt_msi.c @@ -0,0 +1,315 @@ +/** + *---------------------------------------------------------------------------- + * Company : SLAC National Accelerator Laboratory + *---------------------------------------------------------------------------- + * Description: + * Virtual PCI-MSI / MSI-X domain implementation. Provides a parent + * irq_domain backed by a software-only irq_chip and a child PCI-MSI + * domain built via pci_msi_create_irq_domain(), which the host bridge + * attaches to its bus so pci_alloc_irq_vectors() succeeds without real + * MSI hardware. + * + * Delivery model: the kernel-allocated child virq is recorded on .alloc + * so emu_msi_fire_safe() can call generic_handle_irq() directly. The + * actual MSI memory writes the kernel programs into the device's MSI + * cap (or MSI-X table) are never observed -- they land in cfg_space[] + * or in the BAR0 RAM-backed page and are simply ignored. + *---------------------------------------------------------------------------- + * This file is part of the aes_stream_drivers package. It is subject to + * the license terms in the LICENSE.txt file found in the top-level directory + * of this distribution and at: + * https://confluence.slac.stanford.edu/display/ppareg/LICENSE.html. + * No part of the aes_stream_drivers package, including this file, may be + * copied, modified, propagated, or distributed except according to the terms + * contained in the LICENSE.txt file. + *---------------------------------------------------------------------------- + **/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "virt_msi.h" + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 19, 0) + +/* + * pci_msi_create_irq_domain() was removed in Linux 6.19 (the global PCI-MSI + * domain model was replaced by per-device msi_create_parent_irq_domain()). + * The emulator's virtual MSI/MSI-X path has not been ported to that API, so + * emulated MSI/MSI-X is unsupported on these kernels. INTx mode -- the default + * and the only mode exercised by the load/test phase of CI -- is unaffected, + * and emu_main.c only calls emu_msi_create() when emu_irq_mode != intx. + */ +int emu_msi_create(struct emu_msi *m) +{ + m->alloc_virq = 0; + m->fwnode = NULL; + m->parent_domain = NULL; + m->msi_domain = NULL; + pr_warn("emu_msi: virtual PCI-MSI/MSI-X unsupported on Linux >= 6.19 " + "(pci_msi_create_irq_domain removed); use emu_irq_mode=intx\n"); + return -EOPNOTSUPP; +} + +void emu_msi_destroy(struct emu_msi *m) +{ + /* emu_msi_create() allocates nothing on this kernel; nothing to free. */ +} + +#else /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 19, 0) */ + +/* + * Software-only irq_chip for the parent domain. The kernel calls these + * during request_irq() / free_irq() and during MSI mask/unmask. None of + * them need to do anything: delivery is via generic_handle_irq() from + * emu_msi_fire_safe(), and there's no real controller to ack/mask. + */ +static void emu_msi_irq_noop(struct irq_data *d) +{ + /* nothing to do */ +} + +/* + * Software-only MSI message composer. Required by msi_domain_activate()'s + * irq_chip_compose_msi_msg() contract -- it walks up the irq hierarchy + * and BUG_ON()s if no chip in the chain implements this op. We fill with + * zeros: the kernel writes the result into the device's MSI cap (or + * MSI-X table entry), but the emulator never observes those writes -- + * delivery is via generic_handle_irq() from emu_msi_fire_safe(), not + * via the message-address/data path that real MSI uses. + */ +static void emu_msi_compose_msg(struct irq_data *d, struct msi_msg *msg) +{ + msg->address_lo = 0; + msg->address_hi = 0; + msg->data = 0; +} + +/* + * Software-only affinity setter. msi_domain_set_affinity() unconditionally + * dereferences parent->chip->irq_set_affinity during irq_startup() (called + * from request_threaded_irq()); a NULL slot crashes the kernel with + * RIP=0x0. Since delivery is via generic_handle_irq() from arbitrary CPU + * context (irq_work IPI / poll thread), affinity has no real meaning here + * -- accept any mask and return NOCOPY so the kernel does not re-write the + * irq_data's effective_mask. + */ +static int emu_msi_set_affinity(struct irq_data *d, const struct cpumask *m, + bool force) +{ + return IRQ_SET_MASK_OK_NOCOPY; +} + +static struct irq_chip emu_msi_parent_chip = { + .name = "EMU-MSI-PARENT", + .irq_ack = emu_msi_irq_noop, + .irq_mask = emu_msi_irq_noop, + .irq_unmask = emu_msi_irq_noop, + .irq_eoi = emu_msi_irq_noop, + .irq_compose_msi_msg = emu_msi_compose_msg, + .irq_set_affinity = emu_msi_set_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +/** + * emu_msi_parent_alloc - Allocate hwirqs in the parent domain. + * + * Called by the kernel's PCI-MSI infrastructure when datadev calls + * pci_alloc_irq_vectors(). For each requested vector, we associate the + * caller-supplied virq with a fresh hwirq and our software chip. + * + * hwirqs come from a recycled bitmap (m->hwirq_map) rather than a + * monotonic counter: the emulator reloads datadev many times within one + * module lifetime, and a counter would leak a hwirq per cycle and + * eventually fail pci_alloc_irq_vectors() even though every prior vector + * was freed. On the -ENOSPC path no bits are set, so a failed attempt + * leaves the pool untouched. + * + * The most recently allocated child virq is stashed in the + * emu_msi state (passed via host_data) so the DMA engine can fire it + * without snooping the bound pci_dev's resources. + */ +static int emu_msi_parent_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct emu_msi *m = d->host_data; + unsigned int i; + unsigned long flags; + unsigned long hw_base; + + if (m == NULL) + return -EINVAL; + + spin_lock_irqsave(&m->hwirq_lock, flags); + hw_base = bitmap_find_next_zero_area(m->hwirq_map, EMU_MSI_HWIRQ_COUNT, + 0, nr_irqs, 0); + if (hw_base >= EMU_MSI_HWIRQ_COUNT) { + spin_unlock_irqrestore(&m->hwirq_lock, flags); + pr_err("emu_msi: hwirq pool exhausted (req=%u cap=%d)\n", + nr_irqs, EMU_MSI_HWIRQ_COUNT); + return -ENOSPC; + } + bitmap_set(m->hwirq_map, hw_base, nr_irqs); + spin_unlock_irqrestore(&m->hwirq_lock, flags); + + for (i = 0; i < nr_irqs; i++) { + irq_domain_set_info(d, virq + i, hw_base + i, + &emu_msi_parent_chip, + NULL, /* chip_data */ + handle_simple_irq, + NULL, NULL); + } + + /* Track the first allocated virq for the DMA engine fire path. The + * datadev driver always allocates a single vector, so virq is the + * one that ends up in pdev->irq via pci_irq_vector(pdev, 0). */ + if (m != NULL) + m->alloc_virq = virq; + + return 0; +} + +static void emu_msi_parent_free(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs) +{ + struct emu_msi *m = d->host_data; + struct irq_data *irqd = irq_get_irq_data(virq); + unsigned long flags; + unsigned int i; + + /* Return the contiguous hwirq run to the pool. The run is laid out + * hw_base..hw_base+nr_irqs by alloc, so the first virq's hwirq is the + * base. Recover it before irq_domain_reset_irq_data() clears it. */ + if (m != NULL && irqd != NULL) { + spin_lock_irqsave(&m->hwirq_lock, flags); + bitmap_clear(m->hwirq_map, irqd->hwirq, nr_irqs); + spin_unlock_irqrestore(&m->hwirq_lock, flags); + } + + for (i = 0; i < nr_irqs; i++) + irq_domain_reset_irq_data(irq_get_irq_data(virq + i)); + + if (m != NULL && m->alloc_virq >= virq && m->alloc_virq < virq + nr_irqs) + m->alloc_virq = 0; +} + +static const struct irq_domain_ops emu_msi_parent_ops = { + .alloc = emu_msi_parent_alloc, + .free = emu_msi_parent_free, +}; + +/* + * msi_domain_info passed to pci_msi_create_irq_domain. The default ops + * supplied by the kernel handle the MSI mask/unmask plumbing on top of + * our parent chip; MSI_FLAG_PCI_MSIX additionally allows MSI-X mode. + */ +static struct irq_chip emu_msi_chip = { + .name = "EMU-MSI", +}; + +static struct msi_domain_info emu_msi_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | + MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX, + .chip = &emu_msi_chip, +}; + +static void emu_msi_work_fn(struct irq_work *work) +{ + struct emu_msi *m = container_of(work, struct emu_msi, irq_work); + + if (m->alloc_virq != 0) + generic_handle_irq(m->alloc_virq); +} + +int emu_msi_create(struct emu_msi *m) +{ + m->alloc_virq = 0; + spin_lock_init(&m->hwirq_lock); + bitmap_zero(m->hwirq_map, EMU_MSI_HWIRQ_COUNT); + + m->fwnode = irq_domain_alloc_named_fwnode("emu-msi"); + if (m->fwnode == NULL) { + pr_err("emu_msi: irq_domain_alloc_named_fwnode failed\n"); + return -ENOMEM; + } + + m->parent_domain = irq_domain_create_linear(m->fwnode, + EMU_MSI_HWIRQ_COUNT, + &emu_msi_parent_ops, m); + if (m->parent_domain == NULL) { + pr_err("emu_msi: parent irq_domain_create_linear failed\n"); + irq_domain_free_fwnode(m->fwnode); + m->fwnode = NULL; + return -ENOMEM; + } + /* IRQ_DOMAIN_FLAG_MSI_PARENT is a 6.0+-only flag and is informational + * for hierarchical multi-bus MSI controllers. A simple software parent + * directly underneath pci_msi_create_irq_domain doesn't need it; omit + * for cross-kernel-version portability across the CI matrix. */ + + m->msi_domain = pci_msi_create_irq_domain(m->fwnode, + &emu_msi_domain_info, + m->parent_domain); + if (m->msi_domain == NULL) { + pr_err("emu_msi: pci_msi_create_irq_domain failed\n"); + irq_domain_remove(m->parent_domain); + m->parent_domain = NULL; + irq_domain_free_fwnode(m->fwnode); + m->fwnode = NULL; + return -ENOMEM; + } + + init_irq_work(&m->irq_work, emu_msi_work_fn); + + pr_info("emu_msi: PCI-MSI domain ready (parent=%p child=%p)\n", + m->parent_domain, m->msi_domain); + return 0; +} + +void emu_msi_destroy(struct emu_msi *m) +{ + /* Stop new fires, then wait for any irq_work queued by a last-gasp + * emu_msi_fire_safe() to finish running. Without this the deferred + * emu_msi_work_fn() could execute after the domains are torn down -- + * or after the module text is freed -- and call generic_handle_irq() + * on a stale virq. Clearing alloc_virq first makes any already-queued + * work a no-op; irq_work_sync() then guarantees it has returned. + * Safe on partially-initialized state: a zero-initialized irq_work + * has no pending flags, so irq_work_sync() returns immediately. */ + m->alloc_virq = 0; + irq_work_sync(&m->irq_work); + + if (m->msi_domain != NULL) { + irq_domain_remove(m->msi_domain); + m->msi_domain = NULL; + } + if (m->parent_domain != NULL) { + irq_domain_remove(m->parent_domain); + m->parent_domain = NULL; + } + if (m->fwnode != NULL) { + irq_domain_free_fwnode(m->fwnode); + m->fwnode = NULL; + } +} + +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 19, 0) */ + +struct irq_domain *emu_msi_get_domain(const struct emu_msi *m) +{ + return m->msi_domain; +} + +void emu_msi_fire_safe(struct emu_msi *m) +{ + if (m->alloc_virq != 0) + irq_work_queue(&m->irq_work); +} diff --git a/emulator/driver/src/virt_msi.h b/emulator/driver/src/virt_msi.h new file mode 100644 index 00000000..2a7aeb80 --- /dev/null +++ b/emulator/driver/src/virt_msi.h @@ -0,0 +1,119 @@ +/** + *---------------------------------------------------------------------------- + * Company : SLAC National Accelerator Laboratory + *---------------------------------------------------------------------------- + * Description: + * Virtual PCI-MSI / MSI-X domain plumbing for the FPGA emulator. + * + * Real PCIe MSI delivery on a software-only virtual host bridge requires + * that the bridge expose an MSI irq_domain so the kernel's + * pci_alloc_irq_vectors() path actually returns a usable Linux IRQ + * instead of failing back to legacy INTx. This file builds two domains: + * + * parent (linear, 16 hwirqs) + * -> child PCI-MSI domain (built by pci_msi_create_irq_domain) + * + * The parent's .alloc op allocates a fresh hwirq, wires it to a + * software-only irq_chip whose ack/mask/unmask are no-ops (delivery + * happens via generic_handle_irq() from emu_msi_fire_safe(), not via a + * real interrupt controller). The child PCI-MSI domain wraps the parent + * using the kernel's default MSI domain/chip ops. + * + * After pci_scan_root_bus_bridge() enumerates the virtual device, the + * host bridge attaches the child domain to the pci_dev via + * dev_set_msi_domain() -- before pci_bus_add_devices() triggers driver + * probe -- so when the datadev driver calls pci_alloc_irq_vectors(... + * PCI_IRQ_MSIX | PCI_IRQ_MSI | PCI_IRQ_INTX) the kernel finds an MSI + * domain to allocate from. + *---------------------------------------------------------------------------- + * This file is part of the aes_stream_drivers package. It is subject to + * the license terms in the LICENSE.txt file found in the top-level directory + * of this distribution and at: + * https://confluence.slac.stanford.edu/display/ppareg/LICENSE.html. + * No part of the aes_stream_drivers package, including this file, may be + * copied, modified, propagated, or distributed except according to the terms + * contained in the LICENSE.txt file. + *---------------------------------------------------------------------------- + **/ +#ifndef __EMU_VIRT_MSI_H__ +#define __EMU_VIRT_MSI_H__ + +#include +#include +#include +#include + +/* + * Maximum hwirqs the virtual MSI parent can hand out. The datadev driver + * only ever requests 1 vector via pci_alloc_irq_vectors(... 1, 1, ...), + * so 16 is generous future-proofing without bloating the linear domain. + * hwirqs are recycled on .free (see hwirq_map below), so this bounds the + * number of vectors live at once, not the number of load/unload cycles. + */ +#define EMU_MSI_HWIRQ_COUNT 16 + +/** + * struct emu_msi - Virtual PCI-MSI domain state + * @fwnode: synthesized fwnode_handle owned by this domain + * @parent_domain: linear hwirq->virq parent + * @msi_domain: PCI-MSI child domain (the one the kernel binds to a bus) + * @alloc_virq: most recently allocated child virq (used by emu_msi_fire) + * @irq_work: deferred fire helper, runs generic_handle_irq() in + * hardirq context via self-IPI + * @hwirq_lock: protects hwirq_map against concurrent alloc/free + * @hwirq_map: bitmap of in-use parent hwirqs; recycled on .free so + * repeated datadev reloads cannot exhaust the pool + */ +struct emu_msi { + struct fwnode_handle *fwnode; + struct irq_domain *parent_domain; + struct irq_domain *msi_domain; + unsigned int alloc_virq; + struct irq_work irq_work; + spinlock_t hwirq_lock; + DECLARE_BITMAP(hwirq_map, EMU_MSI_HWIRQ_COUNT); +}; + +/** + * emu_msi_create - Build the virtual PCI-MSI parent + child domains. + * @m: caller-allocated state structure (zero-initialized) + * + * Both domains are built unconditionally; the caller decides at host-bridge + * creation time whether to attach the resulting MSI domain to the bus + * (skip for "intx" mode, attach for "msi"/"msix" modes). + * + * Return: 0 on success, negative errno on failure + */ +int emu_msi_create(struct emu_msi *m); + +/** + * emu_msi_destroy - Tear down the virtual PCI-MSI domains. + * @m: state structure + * + * Idempotent on partially-initialized state (NULL pointer checks throughout). + */ +void emu_msi_destroy(struct emu_msi *m); + +/** + * emu_msi_get_domain - Return the PCI-MSI child domain. + * @m: state structure + * + * Caller passes this to the host bridge so the kernel can allocate + * MSI/MSI-X vectors against it during pci_alloc_irq_vectors(). + * + * Return: irq_domain pointer (or NULL if create() failed) + */ +struct irq_domain *emu_msi_get_domain(const struct emu_msi *m); + +/** + * emu_msi_fire_safe - Trigger the most recently allocated MSI virq. + * @m: state structure + * + * Mirrors emu_irq_fire_safe (legacy INTx path) but targets the kernel- + * allocated child virq instead of the linear parent. Safe to call from + * any context; uses irq_work to defer the generic_handle_irq() call to + * hardirq context. + */ +void emu_msi_fire_safe(struct emu_msi *m); + +#endif /* __EMU_VIRT_MSI_H__ */ diff --git a/emulator/driver/src/virt_pci_host.c b/emulator/driver/src/virt_pci_host.c index e2918fc9..4032d702 100644 --- a/emulator/driver/src/virt_pci_host.c +++ b/emulator/driver/src/virt_pci_host.c @@ -73,8 +73,9 @@ static void emu_pci_init_cfg_space(struct emu_pci_host *host) /* Command = 0x0000 (driver sets bus master + mem space enable) */ cfg_write16(cfg, EMU_CFG_COMMAND, 0x0000); - /* Status = 0x0010 (capabilities list present) */ - cfg_write16(cfg, EMU_CFG_STATUS, 0x0010); + /* Status: the Capabilities List bit (0x0010) is set per-irq_mode below. + * INTx-only advertises no capabilities, so it stays cleared in that case. */ + cfg_write16(cfg, EMU_CFG_STATUS, 0x0000); /* Revision = 0x01 */ cfg[EMU_CFG_REVISION] = 0x01; @@ -111,8 +112,58 @@ static void emu_pci_init_cfg_space(struct emu_pci_host *host) /* IRQ line = virtual IRQ number */ cfg[EMU_CFG_IRQ_LINE] = (uint8_t)(host->virq & 0xFF); - /* IRQ pin = INTA# */ + /* IRQ pin = INTA#. Always assert that the function has an INTx pin so + * pci_alloc_irq_vectors(... PCI_IRQ_INTX) succeeds even when MSI/MSI-X + * are also advertised -- the cascade picks the highest-priority kind + * that's actually advertised, but it's spec-correct for a function to + * report all three. */ cfg[EMU_CFG_IRQ_PIN] = 0x01; + + /* + * Capability list. Pick a single capability to advertise based on + * irq_mode and set the Status "Capabilities List" bit (0x0010) to match + * -- the kernel's pci_alloc_irq_vectors cascade then negotiates the + * matching path. INTx-only advertises no capability and leaves the bit + * clear so the config space stays internally consistent. + */ + switch (host->irq_mode) { + case EMU_IRQ_MODE_MSIX: { + /* 12-byte MSI-X capability at EMU_CFG_MSIX_CAP */ + uint16_t mc = 0x0000; /* Table Size = 0 (means 1 vector); Enable=0 */ + uint32_t table_bir = EMU_MSIX_TABLE_OFF | 0x0; /* BIR=0 (BAR0) */ + uint32_t pba_bir = EMU_MSIX_PBA_OFF | 0x0; /* BIR=0 (BAR0) */ + + cfg_write16(cfg, EMU_CFG_STATUS, 0x0010); + cfg[EMU_CFG_CAP_PTR] = EMU_CFG_MSIX_CAP; + cfg[EMU_CFG_MSIX_CAP + 0] = 0x11; /* Cap ID = MSI-X */ + cfg[EMU_CFG_MSIX_CAP + 1] = 0x00; /* Next = end */ + cfg_write16(cfg, EMU_CFG_MSIX_CAP + 2, mc); + cfg_write32(cfg, EMU_CFG_MSIX_CAP + 4, table_bir); + cfg_write32(cfg, EMU_CFG_MSIX_CAP + 8, pba_bir); + break; + } + case EMU_IRQ_MODE_MSI: { + /* 10-byte 32-bit MSI capability at EMU_CFG_MSI_CAP. MMC=0 means a + * single message vector is supported; 64-bit address bit (MC[7]) is + * zero so the kernel writes a 32-bit Message Address only. */ + uint16_t mc = 0x0000; /* MMC=0, MME=0, no 64-bit, no per-vec mask */ + + cfg_write16(cfg, EMU_CFG_STATUS, 0x0010); + cfg[EMU_CFG_CAP_PTR] = EMU_CFG_MSI_CAP; + cfg[EMU_CFG_MSI_CAP + 0] = 0x05; /* Cap ID = MSI */ + cfg[EMU_CFG_MSI_CAP + 1] = 0x00; /* Next = end */ + cfg_write16(cfg, EMU_CFG_MSI_CAP + 2, mc); + /* Bytes 4-7: Message Address (kernel writes during enable, RW) */ + /* Bytes 8-9: Message Data (kernel writes during enable, RW) */ + break; + } + case EMU_IRQ_MODE_INTX: + default: + /* No capability list -- legacy INTx only, matches pre-MSI behavior. + * Status "Capabilities List" bit stays clear (set to 0x0000 above). */ + cfg[EMU_CFG_CAP_PTR] = 0x00; + break; + } } /** @@ -301,16 +352,20 @@ static struct notifier_block emu_pci_nb = { */ int emu_pci_host_create(struct emu_pci_host *host, struct emu_bar0 *bar, - unsigned int virq) + unsigned int virq, + enum emu_irq_mode irq_mode, + struct irq_domain *msi_domain) { struct pci_dev *pdev; int ret; - /* Store BAR0 and IRQ references */ - host->bar = bar; - host->virq = virq; + /* Store BAR0, IRQ, and mode references */ + host->bar = bar; + host->virq = virq; + host->irq_mode = irq_mode; + host->msi_domain = msi_domain; - /* Populate PCI config space with static values */ + /* Populate PCI config space with static values (caps gated on irq_mode) */ emu_pci_init_cfg_space(host); /* Initialize the memory resource for BAR0 */ @@ -355,11 +410,12 @@ int emu_pci_host_create(struct emu_pci_host *host, pci_bus_claim_resources(host->bus); /* - * Fix pci_dev->irq: The PCI subsystem does NOT populate pcidev->irq + * Set pci_dev->irq: the PCI subsystem does NOT populate pcidev->irq * from config space byte 0x3C for virtual host bridges. The datadev - * driver checks pcidev->irq != 0 at data_dev_top.c:270 and fails - * probe if it is 0. Set the IRQ on all enumerated devices BEFORE - * pci_bus_add_devices() triggers the driver probe path. + * driver's pci_alloc_irq_vectors(... PCI_IRQ_INTX) fall-through hands + * back this value via pci_irq_vector(pdev, 0), so it must be a valid + * legacy IRQ for the INTx branch of the cascade to work. Set it on all + * enumerated devices BEFORE pci_bus_add_devices() triggers driver probe. */ for_each_pci_dev(pdev) { if (pdev->bus == host->bus) { @@ -370,6 +426,20 @@ int emu_pci_host_create(struct emu_pci_host *host, pdev->dev.cma_area = NULL; #endif set_dev_node(&pdev->dev, NUMA_NO_NODE); + + /* Attach the virtual PCI-MSI domain so the datadev driver's + * pci_alloc_irq_vectors(... PCI_IRQ_MSIX | PCI_IRQ_MSI | ...) + * call finds something to allocate from. INTX mode leaves the + * domain unset; the kernel skips MSI/MSI-X (no caps in cfg + * space) and lands on legacy INTx using pdev->irq above. + */ + if (host->irq_mode != EMU_IRQ_MODE_INTX && host->msi_domain) { + dev_set_msi_domain(&pdev->dev, host->msi_domain); + pr_info("emu: attached MSI domain to %s (mode=%s)\n", + pci_name(pdev), + host->irq_mode == EMU_IRQ_MODE_MSIX ? "msix" : "msi"); + } + pr_info("emu: set pci_dev %s irq to %u, DMA mask 64-bit\n", pci_name(pdev), virq); } diff --git a/emulator/driver/src/virt_pci_host.h b/emulator/driver/src/virt_pci_host.h index 732d3ca9..e77e4f44 100644 --- a/emulator/driver/src/virt_pci_host.h +++ b/emulator/driver/src/virt_pci_host.h @@ -48,29 +48,65 @@ #define EMU_CFG_IRQ_LINE 0x3C #define EMU_CFG_IRQ_PIN 0x3D +/* + * Capability block offsets within cfg_space. Both layouts fit comfortably + * inside the 256-byte standard config space. Only one cap is installed at + * a time -- emu_pci_init_cfg_space picks based on irq_mode. + */ +#define EMU_CFG_MSI_CAP 0x40 /* 10-byte 32-bit MSI capability */ +#define EMU_CFG_MSIX_CAP 0x50 /* 12-byte MSI-X capability */ + +/* + * MSI-X table + PBA placement inside BAR0. These must live inside the + * guaranteed-minimum BAR0 allocation: emu_bar0_alloc() may fall back as + * far as EMU_BAR0_MIN_ORDER (256 KB) under host memory fragmentation, and + * since MSI-X is advertised as the only capability, a table offset past + * the actual BAR length would make pci_alloc_irq_vectors(...MSIX...) fail + * with no MSI fall-back in cfg_space. The emulated register regions top + * out at 0x30000 (GPU_ASYNC end); 0x38000/0x39000 sit above them, in + * separate 4 KB pages, and below the 0x40000 minimum BAR size. + */ +#define EMU_MSIX_TABLE_OFF 0x00038000 +#define EMU_MSIX_PBA_OFF 0x00039000 + +/* IRQ mode selector for the virtual device */ +enum emu_irq_mode { + EMU_IRQ_MODE_INTX = 0, + EMU_IRQ_MODE_MSI = 1, + EMU_IRQ_MODE_MSIX = 2, +}; + /** * struct emu_pci_host - Virtual PCI host bridge state - * @bridge: PCI host bridge allocated by pci_alloc_host_bridge() - * @bus: root PCI bus created during bus scan + * @bridge: PCI host bridge allocated by pci_alloc_host_bridge() + * @bus: root PCI bus created during bus scan * @cfg_space: emulated PCI configuration space (256 bytes) - * @bar: pointer to BAR0 memory allocation state - * @virq: virtual IRQ number for the emulated device - * @mem_res: memory resource describing BAR0 region + * @bar: pointer to BAR0 memory allocation state + * @virq: virtual IRQ number for the emulated device (legacy INTx slot) + * @mem_res: memory resource describing BAR0 region + * @irq_mode: which IRQ delivery mode the cfg space advertises + * @msi_domain: PCI-MSI domain attached to enumerated devices when irq_mode != INTX */ struct emu_pci_host { struct pci_host_bridge *bridge; struct pci_bus *bus; uint8_t cfg_space[EMU_PCI_CFG_SIZE]; - struct emu_bar0 *bar; /* pointer to BAR0 data */ - unsigned int virq; /* virtual IRQ number */ - struct resource mem_res; /* memory resource for BAR0 */ + struct emu_bar0 *bar; + unsigned int virq; + struct resource mem_res; + enum emu_irq_mode irq_mode; + struct irq_domain *msi_domain; }; /** * emu_pci_host_create - Create virtual PCI host bridge and scan bus - * @host: host bridge state structure (caller-allocated) - * @bar: initialized BAR0 memory allocation - * @virq: virtual IRQ number to assign to the emulated device + * @host: host bridge state structure (caller-allocated) + * @bar: initialized BAR0 memory allocation + * @virq: virtual IRQ number to assign to the emulated device + * (used for legacy INTx; ignored once MSI/MSI-X is negotiated) + * @irq_mode: which IRQ kind to advertise in cfg_space + * @msi_domain: PCI-MSI irq_domain to attach to enumerated devices when + * irq_mode is MSI or MSI-X; ignored for INTX * * Creates a PCI host bridge with custom pci_ops that emulate config space * for a device matching the datadev driver's PCI ID table. Scanning the @@ -80,7 +116,9 @@ struct emu_pci_host { */ int emu_pci_host_create(struct emu_pci_host *host, struct emu_bar0 *bar, - unsigned int virq); + unsigned int virq, + enum emu_irq_mode irq_mode, + struct irq_domain *msi_domain); /** * emu_pci_host_destroy - Tear down the virtual PCI host bridge diff --git a/tests/test_irq_modes.sh b/tests/test_irq_modes.sh index 9ccd7828..7f893700 100755 --- a/tests/test_irq_modes.sh +++ b/tests/test_irq_modes.sh @@ -34,6 +34,7 @@ set -uo pipefail DEV="${DEV:-/dev/datadev_0}" APP_BIN="${APP_BIN:-data_dev/app/bin}" DATADEV_KO="${DATADEV_KO:-data_dev/driver/datadev.ko}" +EMULATOR_KO="${EMULATOR_KO:-emulator/driver/datadev_emulator.ko}" SIZE="${SIZE:-32768}" TIMEOUT_SEC="${TIMEOUT_SEC:-15}" INSMOD_TIMEOUT_SEC="${INSMOD_TIMEOUT_SEC:-120}" @@ -64,11 +65,16 @@ echo "=== IRQ behavior sweep test ===" # run_irq_cycle LABEL [insmod_params...] # Reloads datadev with the given parameters, runs a short dmaLoopTest, checks -# PRBS and dmesg. Sets CYCLE_FAIL=1 on any error. +# PRBS and dmesg. Sets CYCLE_FAIL=1 on any error. Also stashes the most +# recent "Probe: using interrupts" line in CYCLE_PROBE_LINE so the +# outer mode sweep can verify the cascade selection without doing an extra +# insmod (which would compound DMA-buffer allocation pressure across the +# 3-mode sweep and trip the cell's later load/unload tests with -ENOMEM). run_irq_cycle() { local label="$1" shift local insmod_params="$*" + CYCLE_PROBE_LINE="" echo "--- IRQ sub-test: $label ---" DMESG_BEFORE=$($SUDO dmesg | wc -l) @@ -97,6 +103,15 @@ run_irq_cycle() { fi $SUDO chmod 666 "$DEV" + # Capture the "Probe: using interrupts" line BEFORE dmaLoopTest + # runs with cfgDebug=1 -- the per-frame Process/MapReturn lines flood + # dmesg fast enough to evict the probe line out of any reasonable tail + # window within ~5 seconds. The outer mode-sweep reads this back via + # CYCLE_PROBE_LINE. + CYCLE_PROBE_LINE=$($SUDO dmesg | tail -n 80 | \ + grep -oE 'Probe: using [A-Za-z0-9-]+([ -]+[A-Za-z]+)* interrupts' | \ + tail -1 || echo "") + sleep 2 # Run dmaLoopTest; let it run until timeout (rc=124 ok). @@ -176,13 +191,105 @@ run_irq_subtest() { return 1 } -run_irq_subtest "cfgIrqHold=1" cfgTxCount=64 cfgRxCount=64 cfgSize=65536 cfgIrqHold=1 cfgDebug=1 +# ---------------------------------------------------------------------------- +# Outer sweep: emu_irq_mode in {intx, msi, msix}. +# +# For each emulator IRQ mode, reload datadev_emulator with the requested mode +# (so cfg_space advertises the matching capability), then run the existing +# three cfgIrqHold/polled subtests against that emulator. After the +# subtests, scrape dmesg to assert datadev's probe selected the correct +# cascade branch -- a regression in data_dev_top.c's pci_alloc_irq_vectors +# call would silently land on a different branch and pass the cfg subtests +# but fail the dmesg gate below. +# +# Hosts without a built emulator (EMULATOR_KO missing) fall back to the +# legacy single-pass behavior against whichever emulator is already loaded. +# This keeps the script useful for hardware-attached test rigs where the +# real datadev hardware can't be hot-swapped between modes. +# ---------------------------------------------------------------------------- + +run_three_irq_subtests() { + local mode_label="$1" + run_irq_subtest "${mode_label}/cfgIrqHold=1" \ + cfgTxCount=64 cfgRxCount=64 cfgSize=65536 cfgIrqHold=1 cfgDebug=1 + run_irq_subtest "${mode_label}/cfgIrqHold=100000" \ + cfgTxCount=64 cfgRxCount=64 cfgSize=65536 cfgIrqHold=100000 cfgDebug=1 + run_irq_subtest "${mode_label}/polled" \ + cfgTxCount=64 cfgRxCount=64 cfgSize=65536 cfgIrqDis=1 cfgDebug=1 +} + +# Map an emu_irq_mode value to the dmesg substring data_dev_top.c logs at +# probe time. Used by the per-mode dmesg gate. +expected_probe_line() { + case "$1" in + intx) printf 'using legacy INTx interrupts' ;; + msi) printf 'using MSI interrupts' ;; + msix) printf 'using MSI-X interrupts' ;; + *) printf 'using ?' ;; + esac +} -# Sub-test 2: heavy IRQ coalescing (cfgIrqHold=100000). -run_irq_subtest "cfgIrqHold=100000" cfgTxCount=64 cfgRxCount=64 cfgSize=65536 cfgIrqHold=100000 cfgDebug=1 +if [ "${GPU_ENABLED:-0}" = "1" ]; then + # GPU CI phase: skip the IRQ-mode sweep. The GPU phase exists to exercise + # the GPU-async path; reloading the emulator three times to flip MSI/MSI-X + # caps adds ~3x runtime to that already-long phase without exercising any + # GPU-specific code path. The CPU phase runs the full sweep on the same + # emulator binary, so MSI/MSI-X coverage is preserved. + echo "=== GPU_ENABLED=1; running legacy single-mode IRQ test (mode sweep is CPU-phase-only) ===" + run_three_irq_subtests "current" +elif [ -f "$EMULATOR_KO" ]; then + echo "=== Emulator detected ($EMULATOR_KO); sweeping emu_irq_mode ===" + for IRQ_MODE in intx msi msix; do + echo "============================================================" + echo " emu_irq_mode=${IRQ_MODE}" + echo "============================================================" -# Sub-test 3: polled mode (cfgIrqDis=1). -run_irq_subtest "polled" cfgTxCount=64 cfgRxCount=64 cfgSize=65536 cfgIrqDis=1 cfgDebug=1 + # Reload emulator with the requested mode. The stub stays loaded + # because the emulator depends on its symbols. + $SUDO rmmod datadev 2>/dev/null || true + $SUDO rmmod datadev_emulator 2>/dev/null || true + for _ in $(seq 1 15); do [ ! -e "$DEV" ] && break; sleep 0.5; done + + if ! timeout --kill-after=5s "${INSMOD_TIMEOUT_SEC}s" \ + $SUDO insmod "$EMULATOR_KO" emu_irq_mode="$IRQ_MODE"; then + echo "[FAIL] could not load emulator with emu_irq_mode=$IRQ_MODE" + FAILED=$((FAILED + 1)) + continue + fi + + # Run the existing three subtests; they reload datadev each cycle + # and (via run_irq_cycle) stash the post-probe dmesg snapshot in + # CYCLE_PROBE_LINE before dmaLoopTest spam can evict it. + FAIL_BEFORE=$FAILED + run_three_irq_subtests "$IRQ_MODE" + + # Mode-selection assertion. Use the probe line captured during the + # last inner subtest (dev_info renders as "datadev 0000:bb:dd.f: + # Init: Probe: using interrupts, irq=N"; we stripped to just + # the "Probe: using interrupts" middle phrase). + # + # Only run this gate when all three subtests passed. If a subtest + # failed it has already been counted, and CYCLE_PROBE_LINE may be + # empty/stale from an aborted cycle -- asserting on it here would + # double-count the failure with a misleading "probe line mismatch". + # When the subtests pass, this still catches a wrong-branch cascade + # (dataplane OK but datadev picked the wrong interrupt kind). + EXPECTED=$(expected_probe_line "$IRQ_MODE") + if [ "$FAILED" -ne "$FAIL_BEFORE" ]; then + echo "[SKIP] irq_modes ${IRQ_MODE} -- probe-line assertion skipped (a subtest already failed this mode)" + elif [ "$CYCLE_PROBE_LINE" = "Probe: ${EXPECTED}" ]; then + echo "[PASS] irq_modes ${IRQ_MODE} -- datadev selected: ${EXPECTED}" + else + echo "[FAIL] irq_modes ${IRQ_MODE} -- captured probe line mismatch:" + echo " expected: 'Probe: ${EXPECTED}'" + echo " captured: '${CYCLE_PROBE_LINE}'" + FAILED=$((FAILED + 1)) + fi + done +else + echo "=== Emulator not found ($EMULATOR_KO); legacy single-mode pass ===" + run_three_irq_subtests "current" +fi # --- Cleanup: restore default parameters --- $SUDO rmmod datadev 2>/dev/null || true From 23c84b338ae10b5096101670dd51d5449c5aacb7 Mon Sep 17 00:00:00 2001 From: Larry Ruckman Date: Tue, 26 May 2026 10:02:39 -0700 Subject: [PATCH 2/7] datadev: abort probe if FPGA advertises multiple IRQ types The IRQ cascade selects by advertised capability, not by what the firmware actually delivers. A bitstream exposing both legacy INTx and MSI lets pci_alloc_irq_vectors() prefer MSI; if the user logic only drives the legacy pin the driver enables interrupts that never arrive, surfacing as DMA timeouts. Require exactly one advertised interrupt type and fail the probe with a clear message otherwise. --- common/driver/data_dev_top.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/common/driver/data_dev_top.c b/common/driver/data_dev_top.c index ac2c0196..2f29bf89 100755 --- a/common/driver/data_dev_top.c +++ b/common/driver/data_dev_top.c @@ -272,6 +272,33 @@ int DataDev_Probe(struct pci_dev *pcidev, const struct pci_device_id *dev_id) { dev->debug = cfgDebug; + // Enforce that the FPGA advertises exactly one interrupt type. The cascade + // below selects by *advertised capability*, not by what the firmware can + // actually deliver: a bitstream that exposes both legacy INTx and MSI lets + // pci_alloc_irq_vectors() prefer MSI, and if the user logic only drives the + // legacy pin the driver ends up with interrupts enabled but none arriving + // (silent DMA timeouts). Require a single type so the selection is + // unambiguous, and abort loudly on a misconfigured core instead. + { + int msiCap = pci_find_capability(pcidev, PCI_CAP_ID_MSI); + int msixCap = pci_find_capability(pcidev, PCI_CAP_ID_MSIX); + uint8_t intxPin = 0; + int irqTypes; + + pci_read_config_byte(pcidev, PCI_INTERRUPT_PIN, &intxPin); + irqTypes = (intxPin != 0) + (msiCap != 0) + (msixCap != 0); + + if (irqTypes > 1) { + dev_err(&pcidev->dev, + "%s: Probe: FPGA advertises multiple IRQ types " + "(INTx pin=%u, MSI=%s, MSI-X=%s); exactly one is required. " + "Rebuild the PCIe IP core with a single interrupt type.\n", + MOD_NAME, intxPin, msiCap ? "yes" : "no", msixCap ? "yes" : "no"); + probeReturn = -EINVAL; + goto err_unmap; + } + } + // Allocate IRQ vectors: try MSI-X first, then MSI, then legacy INTx. // pci_alloc_irq_vectors() walks the requested types in priority order // and returns the count of vectors negotiated; pci_irq_vector(pdev, 0) From 643863f20ef7821e9dc51bd3302b16af93780cdd Mon Sep 17 00:00:00 2001 From: Larry Ruckman Date: Tue, 26 May 2026 10:51:41 -0700 Subject: [PATCH 3/7] datadev: abort probe if BAR0 size is not the expected 16 MB The register map is hardwired to a 16 MB window (2 * USER_SIZE: DMA engine, PCIe PHY, AxiVersion, and the user region). Firmware built with a different BAR0 size has a mismatched address map, which otherwise surfaces as out-of-bounds register access. Validate the size at probe and fail with a clear message to catch incompatible builds early. --- common/driver/data_dev_top.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/common/driver/data_dev_top.c b/common/driver/data_dev_top.c index 2f29bf89..b3f0741b 100755 --- a/common/driver/data_dev_top.c +++ b/common/driver/data_dev_top.c @@ -242,6 +242,21 @@ int DataDev_Probe(struct pci_dev *pcidev, const struct pci_device_id *dev_id) { goto err_post_en; } + // The driver's register map spans a fixed 16 MB window (2 * USER_SIZE): + // DMA engine (AGEN2_OFF), PCIe PHY (PHY_OFF), AxiVersion (AVER_OFF), and + // the user region (rwBase = base + PHY_OFF, rwSize = 2*USER_SIZE - PHY_OFF). + // A BAR0 of any other size means the firmware was built with a mismatched + // address map; reject it here with a clear message instead of letting it + // surface as out-of-bounds register access later. + if ( dev->baseSize != (2 * USER_SIZE) ) { + dev_err(&pcidev->dev, + "%s: Probe: BAR0 size 0x%x != expected 0x%x (16 MB); " + "firmware register-map mismatch.\n", + MOD_NAME, dev->baseSize, (unsigned int)(2 * USER_SIZE)); + probeReturn = -EINVAL; + goto err_post_en; + } + // Set basic device attributes dev->pcidev = pcidev; // PCI device structure dev->device = &(pcidev->dev); // Device structure From 2c762ced7197fa988ad3e6226bafb7a9da533416 Mon Sep 17 00:00:00 2001 From: Larry Ruckman Date: Tue, 26 May 2026 13:25:13 -0700 Subject: [PATCH 4/7] datadev: only reject oversized BAR0 so the emulator's smaller fallback loads The exact-match BAR0 check broke emulator-backed CI: the emulator requests 16 MB but shrinks the BAR when it cannot reserve that much contiguous memory, so fragmented runners presented a 4 MB BAR and the probe rejected it. The register map only tops out at 16 MB, so reject solely oversized BARs (the mismatched-firmware case this guards against) and tolerate a smaller BAR. --- common/driver/data_dev_top.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/common/driver/data_dev_top.c b/common/driver/data_dev_top.c index b3f0741b..d497785d 100755 --- a/common/driver/data_dev_top.c +++ b/common/driver/data_dev_top.c @@ -242,15 +242,17 @@ int DataDev_Probe(struct pci_dev *pcidev, const struct pci_device_id *dev_id) { goto err_post_en; } - // The driver's register map spans a fixed 16 MB window (2 * USER_SIZE): - // DMA engine (AGEN2_OFF), PCIe PHY (PHY_OFF), AxiVersion (AVER_OFF), and - // the user region (rwBase = base + PHY_OFF, rwSize = 2*USER_SIZE - PHY_OFF). - // A BAR0 of any other size means the firmware was built with a mismatched - // address map; reject it here with a clear message instead of letting it - // surface as out-of-bounds register access later. - if ( dev->baseSize != (2 * USER_SIZE) ) { + // The driver's register map tops out at a fixed 16 MB window (2 * USER_SIZE: + // DMA engine (AGEN2_OFF), PCIe PHY (PHY_OFF), AxiVersion (AVER_OFF), and the + // user region: rwBase = base + PHY_OFF, rwSize = 2*USER_SIZE - PHY_OFF). A + // BAR0 *larger* than this means the firmware was built with a different + // address map, so reject it with a clear message instead of letting the + // mismatch surface later. A smaller BAR0 is tolerated: the emulator shrinks + // its BAR when it cannot reserve 16 MB of contiguous memory, and only the + // lower register regions are exercised in that configuration. + if ( dev->baseSize > (2 * USER_SIZE) ) { dev_err(&pcidev->dev, - "%s: Probe: BAR0 size 0x%x != expected 0x%x (16 MB); " + "%s: Probe: BAR0 size 0x%x exceeds expected 0x%x (16 MB); " "firmware register-map mismatch.\n", MOD_NAME, dev->baseSize, (unsigned int)(2 * USER_SIZE)); probeReturn = -EINVAL; From 50e6f3da467f18ff2a95f68b1754d5d6509c911c Mon Sep 17 00:00:00 2001 From: Larry Ruckman Date: Tue, 26 May 2026 13:30:15 -0700 Subject: [PATCH 5/7] emulator: advertise a single IRQ type per emu_irq_mode The virtual function asserted the INTx pin unconditionally, so in msi/msix modes it advertised INTx plus the MSI/MSI-X capability. The datadev probe now rejects a function exposing more than one interrupt type, which broke the test_irq_modes.sh MSI/MSI-X subtests. Assert INTA# only in INTx mode and leave the pin clear for MSI/MSI-X so each mode advertises exactly one type, matching the single-IRQ-type firmware policy the driver enforces. --- emulator/driver/src/virt_pci_host.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/emulator/driver/src/virt_pci_host.c b/emulator/driver/src/virt_pci_host.c index 4032d702..2a78c2a8 100644 --- a/emulator/driver/src/virt_pci_host.c +++ b/emulator/driver/src/virt_pci_host.c @@ -112,12 +112,11 @@ static void emu_pci_init_cfg_space(struct emu_pci_host *host) /* IRQ line = virtual IRQ number */ cfg[EMU_CFG_IRQ_LINE] = (uint8_t)(host->virq & 0xFF); - /* IRQ pin = INTA#. Always assert that the function has an INTx pin so - * pci_alloc_irq_vectors(... PCI_IRQ_INTX) succeeds even when MSI/MSI-X - * are also advertised -- the cascade picks the highest-priority kind - * that's actually advertised, but it's spec-correct for a function to - * report all three. */ - cfg[EMU_CFG_IRQ_PIN] = 0x01; + /* IRQ pin is asserted per irq_mode in the switch below: INTx mode reports + * INTA#, while MSI/MSI-X modes leave it cleared so the function advertises + * exactly one interrupt type. The datadev driver rejects a function that + * advertises more than one type, mirroring the single-IRQ-type firmware + * policy this emulator models. */ /* * Capability list. Pick a single capability to advertise based on @@ -160,8 +159,10 @@ static void emu_pci_init_cfg_space(struct emu_pci_host *host) case EMU_IRQ_MODE_INTX: default: /* No capability list -- legacy INTx only, matches pre-MSI behavior. - * Status "Capabilities List" bit stays clear (set to 0x0000 above). */ + * Status "Capabilities List" bit stays clear (set to 0x0000 above). + * Assert INTA# so this is the single advertised interrupt type. */ cfg[EMU_CFG_CAP_PTR] = 0x00; + cfg[EMU_CFG_IRQ_PIN] = 0x01; break; } } From 74fb5c6fe8fb2326c8537a364b06f21eec045848 Mon Sep 17 00:00:00 2001 From: Larry Ruckman Date: Sun, 31 May 2026 14:19:35 -0700 Subject: [PATCH 6/7] datadev/emulator: drop the single-IRQ-type rejection Linux pci_alloc_irq_vectors(... PCI_IRQ_MSIX | PCI_IRQ_MSI | PCI_IRQ_INTX) already negotiates the priority cascade for us: it picks MSI-X if the function advertises it, falls through to MSI, then to INTx, and only returns < 0 when none of the three is available -- which the existing err_unmap path already handles. The probe-time guard rejecting any function that advertised more than one type was over-cautious: legacy PCIe IP cores routinely expose INTx alongside MSI/MSI-X in the same config space, and the kernel cascade silently does the right thing in that case. Removing the guard restores backwards compatibility with those cores. Mirror the change in the emulator: re-assert INTA# unconditionally so the virtual function can advertise INTx alongside whichever MSI/MSI-X capability emu_irq_mode selects, modeling the legacy IP-core layout. --- common/driver/data_dev_top.c | 42 ++++++----------------------- emulator/driver/src/virt_pci_host.c | 16 +++++------ 2 files changed, 16 insertions(+), 42 deletions(-) diff --git a/common/driver/data_dev_top.c b/common/driver/data_dev_top.c index d497785d..497b3f5f 100755 --- a/common/driver/data_dev_top.c +++ b/common/driver/data_dev_top.c @@ -288,38 +288,15 @@ int DataDev_Probe(struct pci_dev *pcidev, const struct pci_device_id *dev_id) { dev->cfgTimeout = cfgTimeout; dev->debug = cfgDebug; - - // Enforce that the FPGA advertises exactly one interrupt type. The cascade - // below selects by *advertised capability*, not by what the firmware can - // actually deliver: a bitstream that exposes both legacy INTx and MSI lets - // pci_alloc_irq_vectors() prefer MSI, and if the user logic only drives the - // legacy pin the driver ends up with interrupts enabled but none arriving - // (silent DMA timeouts). Require a single type so the selection is - // unambiguous, and abort loudly on a misconfigured core instead. - { - int msiCap = pci_find_capability(pcidev, PCI_CAP_ID_MSI); - int msixCap = pci_find_capability(pcidev, PCI_CAP_ID_MSIX); - uint8_t intxPin = 0; - int irqTypes; - - pci_read_config_byte(pcidev, PCI_INTERRUPT_PIN, &intxPin); - irqTypes = (intxPin != 0) + (msiCap != 0) + (msixCap != 0); - - if (irqTypes > 1) { - dev_err(&pcidev->dev, - "%s: Probe: FPGA advertises multiple IRQ types " - "(INTx pin=%u, MSI=%s, MSI-X=%s); exactly one is required. " - "Rebuild the PCIe IP core with a single interrupt type.\n", - MOD_NAME, intxPin, msiCap ? "yes" : "no", msixCap ? "yes" : "no"); - probeReturn = -EINVAL; - goto err_unmap; - } - } - // Allocate IRQ vectors: try MSI-X first, then MSI, then legacy INTx. // pci_alloc_irq_vectors() walks the requested types in priority order // and returns the count of vectors negotiated; pci_irq_vector(pdev, 0) - // returns the right Linux IRQ number regardless of which path won. + // returns the right Linux IRQ number regardless of which path won. A + // bitstream that advertises more than one type is tolerated for + // backwards compatibility with legacy PCIe IP cores that report + // INTx + MSI/MSI-X simultaneously: the cascade picks the highest + // priority kind that's actually advertised. Probe fails only if all + // three are unavailable. ret = pci_alloc_irq_vectors(pcidev, 1, 1, PCI_IRQ_MSIX | PCI_IRQ_MSI | PCI_IRQ_INTX); if (ret < 0) { @@ -340,14 +317,11 @@ int DataDev_Probe(struct pci_dev *pcidev, const struct pci_device_id *dev_id) { goto err_unmap; } dev->irq = ret; - if (pcidev->msix_enabled) dev->irqType = DMA_IRQ_MSIX; - else if (pcidev->msi_enabled) dev->irqType = DMA_IRQ_MSI; - else dev->irqType = DMA_IRQ_INTX; dev_info(dev->device, "Init: Probe: using %s interrupts, irq=%u\n", - dev->irqType == DMA_IRQ_MSIX ? "MSI-X" : - dev->irqType == DMA_IRQ_MSI ? "MSI" : "legacy INTx", + pcidev->msix_enabled ? "MSI-X" : + pcidev->msi_enabled ? "MSI" : "legacy INTx", dev->irq); // Set basic device context diff --git a/emulator/driver/src/virt_pci_host.c b/emulator/driver/src/virt_pci_host.c index 2a78c2a8..9826aa91 100644 --- a/emulator/driver/src/virt_pci_host.c +++ b/emulator/driver/src/virt_pci_host.c @@ -112,11 +112,13 @@ static void emu_pci_init_cfg_space(struct emu_pci_host *host) /* IRQ line = virtual IRQ number */ cfg[EMU_CFG_IRQ_LINE] = (uint8_t)(host->virq & 0xFF); - /* IRQ pin is asserted per irq_mode in the switch below: INTx mode reports - * INTA#, while MSI/MSI-X modes leave it cleared so the function advertises - * exactly one interrupt type. The datadev driver rejects a function that - * advertises more than one type, mirroring the single-IRQ-type firmware - * policy this emulator models. */ + /* IRQ pin = INTA#. Always assert that the function has an INTx pin so + * pci_alloc_irq_vectors(... PCI_IRQ_INTX) succeeds even when MSI/MSI-X + * are also advertised -- the cascade picks the highest-priority kind + * that's actually advertised, but it's spec-correct for a function to + * report all three. This also models legacy PCIe IP cores that expose + * INTx alongside MSI/MSI-X in the same config space. */ + cfg[EMU_CFG_IRQ_PIN] = 0x01; /* * Capability list. Pick a single capability to advertise based on @@ -159,10 +161,8 @@ static void emu_pci_init_cfg_space(struct emu_pci_host *host) case EMU_IRQ_MODE_INTX: default: /* No capability list -- legacy INTx only, matches pre-MSI behavior. - * Status "Capabilities List" bit stays clear (set to 0x0000 above). - * Assert INTA# so this is the single advertised interrupt type. */ + * Status "Capabilities List" bit stays clear (set to 0x0000 above). */ cfg[EMU_CFG_CAP_PTR] = 0x00; - cfg[EMU_CFG_IRQ_PIN] = 0x01; break; } } From 64640eb3b6b192988d1af3e0e1a3ad482201c281 Mon Sep 17 00:00:00 2001 From: Larry Ruckman Date: Sun, 31 May 2026 14:19:45 -0700 Subject: [PATCH 7/7] datadev: drop unused DmaDevice.irqType / DMA_IRQ_* defines DmaDevice.irqType was only consumed by the dev_info() rendered immediately after it was assigned in DataDev_Probe(); nothing in the shared Dma_Init/Dma_Clean paths or any sibling driver branched on it. Read pcidev->msix_enabled / msi_enabled directly when formatting the probe log message and remove the field plus the DMA_IRQ_NONE/INTX/MSI/MSIX defines. --- common/driver/dma_common.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/common/driver/dma_common.h b/common/driver/dma_common.h index 830d9946..b9080f88 100755 --- a/common/driver/dma_common.h +++ b/common/driver/dma_common.h @@ -44,15 +44,6 @@ // Maximum number of destination channels #define DMA_MAX_DEST (8*DMA_MASK_SIZE) -// IRQ delivery type selected at probe time. Set by the PCI top-level -// driver (data_dev_top.c) based on what pci_alloc_irq_vectors() actually -// negotiated; consumed for diagnostic logging only -- the shared -// Dma_Init/Dma_Clean paths in dma_common.c do not branch on this. -#define DMA_IRQ_NONE 0 -#define DMA_IRQ_INTX 1 -#define DMA_IRQ_MSI 2 -#define DMA_IRQ_MSIX 3 - // Forward declarations struct hardware_functions; struct DmaDesc; @@ -91,7 +82,6 @@ typedef uint32_t __poll_t; * @utilData: Utility data for driver use. * @debug: Debug flag. * @irq: IRQ number. - * @irqType: IRQ delivery type (DMA_IRQ_NONE/INTX/MSI/MSIX). * @writeHwLock: Spinlock for hardware write operations. * @commandLock: Spinlock for command operations. * @maskLock: Spinlock for destination mask operations. @@ -155,7 +145,6 @@ struct DmaDevice { // IRQ uint32_t irq; - uint8_t irqType; // Locks spinlock_t writeHwLock;