diff --git a/common/driver/data_dev_top.c b/common/driver/data_dev_top.c index bcc82a9d..497b3f5f 100755 --- a/common/driver/data_dev_top.c +++ b/common/driver/data_dev_top.c @@ -39,6 +39,14 @@ #include #endif +// PCI_IRQ_LEGACY was renamed to PCI_IRQ_INTX in Linux 6.11; the old name was +// kept as a deprecated alias for a few releases, then dropped. Provide the new +// spelling on pre-6.11 kernels so the single pci_alloc_irq_vectors() call site +// builds across the CI matrix (Ubuntu 22.04 / kernel 5.15 lacks PCI_IRQ_INTX). +#ifndef PCI_IRQ_INTX +#define PCI_IRQ_INTX PCI_IRQ_LEGACY +#endif + // Init Configuration values static int cfgTxCount = 1024; static int cfgRxCount = 1024; @@ -234,6 +242,23 @@ int DataDev_Probe(struct pci_dev *pcidev, const struct pci_device_id *dev_id) { goto err_post_en; } + // The driver's register map tops out at a fixed 16 MB window (2 * USER_SIZE: + // DMA engine (AGEN2_OFF), PCIe PHY (PHY_OFF), AxiVersion (AVER_OFF), and the + // user region: rwBase = base + PHY_OFF, rwSize = 2*USER_SIZE - PHY_OFF). A + // BAR0 *larger* than this means the firmware was built with a different + // address map, so reject it with a clear message instead of letting the + // mismatch surface later. A smaller BAR0 is tolerated: the emulator shrinks + // its BAR when it cannot reserve 16 MB of contiguous memory, and only the + // lower register regions are exercised in that configuration. + if ( dev->baseSize > (2 * USER_SIZE) ) { + dev_err(&pcidev->dev, + "%s: Probe: BAR0 size 0x%x exceeds expected 0x%x (16 MB); " + "firmware register-map mismatch.\n", + MOD_NAME, dev->baseSize, (unsigned int)(2 * USER_SIZE)); + probeReturn = -EINVAL; + goto err_post_en; + } + // Set basic device attributes dev->pcidev = pcidev; // PCI device structure dev->device = &(pcidev->dev); // Device structure @@ -263,16 +288,41 @@ int DataDev_Probe(struct pci_dev *pcidev, const struct pci_device_id *dev_id) { dev->cfgTimeout = cfgTimeout; dev->debug = cfgDebug; + // Allocate IRQ vectors: try MSI-X first, then MSI, then legacy INTx. + // pci_alloc_irq_vectors() walks the requested types in priority order + // and returns the count of vectors negotiated; pci_irq_vector(pdev, 0) + // returns the right Linux IRQ number regardless of which path won. A + // bitstream that advertises more than one type is tolerated for + // backwards compatibility with legacy PCIe IP cores that report + // INTx + MSI/MSI-X simultaneously: the cascade picks the highest + // priority kind that's actually advertised. Probe fails only if all + // three are unavailable. + ret = pci_alloc_irq_vectors(pcidev, 1, 1, + PCI_IRQ_MSIX | PCI_IRQ_MSI | PCI_IRQ_INTX); + if (ret < 0) { + dev_err(&pcidev->dev, + "%s: Probe: pci_alloc_irq_vectors() = %i\n", MOD_NAME, ret); + probeReturn = ret; + goto err_unmap; + } - // Assign the IRQ number from the pci_dev structure - dev->irq = pcidev->irq; - - // Check that we actually have an IRQ - if (dev->irq == 0) { - dev_err(dev->device, "%s: No IRQ associated with PCI device\n", MOD_NAME); - probeReturn = -EINVAL; + // pci_irq_vector() returns a signed errno on failure; capture it in an + // int and reject < 0 before storing into the uint32_t dev->irq, otherwise + // a negative value would wrap to a bogus IRQ and reach request_irq(). + ret = pci_irq_vector(pcidev, 0); + if (ret < 0) { + dev_err(&pcidev->dev, + "%s: Probe: pci_irq_vector() = %i\n", MOD_NAME, ret); + probeReturn = ret; goto err_unmap; } + dev->irq = ret; + + dev_info(dev->device, + "Init: Probe: using %s interrupts, irq=%u\n", + pcidev->msix_enabled ? "MSI-X" : + pcidev->msi_enabled ? "MSI" : "legacy INTx", + dev->irq); // Set basic device context @@ -347,6 +397,8 @@ int DataDev_Probe(struct pci_dev *pcidev, const struct pci_device_id *dev_id) { dev->utilData = NULL; #endif Dma_UnmapReg(dev); // Idempotent: safe even if Dma_MapReg never ran + pci_free_irq_vectors(pcidev); // Releases MSI/MSI-X/INTx allocation; no-op if + // pci_alloc_irq_vectors failed or never ran err_post_en: pci_disable_device(pcidev); // Disable PCI device on failure err_pre_en: @@ -407,9 +459,12 @@ void DataDev_Remove(struct pci_dev *pcidev) { } #endif - // Call common DMA clean function + // Call common DMA clean function (calls free_irq() internally) Dma_Clean(dev); + // Release MSI/MSI-X/INTx vectors (must follow free_irq, precede pci_disable_device) + pci_free_irq_vectors(pcidev); + // Disable device pci_disable_device(pcidev); diff --git a/docs/explanation/ci-pipeline.rst b/docs/explanation/ci-pipeline.rst index a10f9e96..4fb66c4c 100644 --- a/docs/explanation/ci-pipeline.rst +++ b/docs/explanation/ci-pipeline.rst @@ -378,8 +378,12 @@ The combined Phase 2 + Phase 3 coverage across all distributions: - 3 - Multi-destination (0, 7, 8), cross-contamination check * - IRQ modes - - 3 - - cfgIrqHold=1, cfgIrqHold=100000, polled (cfgIrqDis=1) + - 12 + - ``emu_irq_mode`` {intx, msi, msix} sweep (CPU phase). Each mode + runs cfgIrqHold=1, cfgIrqHold=100000, and polled (cfgIrqDis=1), + plus a probe-cascade assertion that ``datadev`` selected the + matching INTx / MSI / MSI-X path. GPU phase runs the single-mode + legacy pass * - Module lifecycle - 4 - 3 rapid reload cycles, rmmod-under-load, cfgMode=1/2 transitions diff --git a/docs/explanation/kernel-compatibility.rst b/docs/explanation/kernel-compatibility.rst index fe59902a..e6548334 100644 --- a/docs/explanation/kernel-compatibility.rst +++ b/docs/explanation/kernel-compatibility.rst @@ -65,6 +65,13 @@ the source files listed. ``class_create()`` dropped its first argument (``THIS_MODULE``) in 6.4. Red Hat backported this change in RHEL 9.4. +``< KERNEL_VERSION(6, 11, 0)`` — ``data_dev_top.c`` + ``PCI_IRQ_LEGACY`` was renamed to ``PCI_IRQ_INTX`` in 6.11 (the old name was kept as a + deprecated alias for a few releases, then dropped). A ``#ifndef PCI_IRQ_INTX`` guard + defines the new spelling as ``PCI_IRQ_LEGACY`` so the single ``pci_alloc_irq_vectors()`` + call site builds on pre-6.11 kernels (e.g. Ubuntu 22.04 / kernel 5.15, which lacks + ``PCI_IRQ_INTX``). + RHEL Backport Detection ------------------------ diff --git a/docs/reference/test-applications.rst b/docs/reference/test-applications.rst index 984e1f8a..47e5fcf4 100644 --- a/docs/reference/test-applications.rst +++ b/docs/reference/test-applications.rst @@ -267,8 +267,13 @@ by ``scripts/ci/test-cpu.sh`` or ``scripts/ci/test-gpu.sh`` during CI. * - ``test_backpressure.sh`` - Saturates TX buffers then verifies recovery and correct dmesg state * - ``test_irq_modes.sh`` - - Sweeps ``cfgIrqHold`` (1, 100000) and polled mode; verifies - loopback works under each IRQ configuration + - Sweeps the emulator's ``emu_irq_mode`` (``intx`` / ``msi`` / + ``msix``). For each mode it reloads ``datadev`` across + ``cfgIrqHold`` (1, 100000) and polled mode, verifies loopback + + PRBS integrity, then asserts that ``datadev``'s probe-time + ``pci_alloc_irq_vectors`` cascade selected the matching interrupt + type. The ``emu_irq_mode`` sweep is CPU-phase only; the GPU phase + and emulator-less hosts run the legacy single-mode pass * - ``test_concurrent_open.sh`` - Two ``dmaLoopTest`` instances on different destinations simultaneously * - ``test_idx_loopback.sh`` diff --git a/emulator/driver/Makefile b/emulator/driver/Makefile index 2c9fb6da..35ee4951 100644 --- a/emulator/driver/Makefile +++ b/emulator/driver/Makefile @@ -52,7 +52,7 @@ else endif # Object files that make up the module -$(NAME)-objs := src/bar0_regs.o src/virt_irq.o src/virt_pci_host.o src/dma_engine.o src/prbs.o src/gpu_engine.o src/emu_main.o +$(NAME)-objs := src/bar0_regs.o src/virt_irq.o src/virt_msi.o src/virt_pci_host.o src/dma_engine.o src/prbs.o src/gpu_engine.o src/emu_main.o # Target module obj-m := $(NAME).o diff --git a/emulator/driver/src/dma_engine.c b/emulator/driver/src/dma_engine.c index 707cb0a6..8ecaae48 100644 --- a/emulator/driver/src/dma_engine.c +++ b/emulator/driver/src/dma_engine.c @@ -368,6 +368,31 @@ static int emu_process_tx(struct emu_dma_engine *eng) return tx_processed; } +/* ---------------------------------------------------------------- + * IRQ delivery dispatch + * ---------------------------------------------------------------- */ + +/* + * emu_dma_fire_irq - Trigger the IRQ that datadev currently has registered. + * + * In INTx mode datadev called request_irq() on emu_irq.virq, so we fire that. + * In MSI / MSI-X mode, the kernel allocated a child virq through our PCI-MSI + * domain when datadev called pci_alloc_irq_vectors(); the parent .alloc op + * stashed it in emu_msi.alloc_virq. Fire that one instead -- emu_irq.virq is + * stale (datadev never request_irq'd it). + * + * The MSI path may briefly have alloc_virq == 0 between emu_dma_start (which + * runs before the host bridge is created) and the kernel's MSI alloc during + * datadev probe; emu_msi_fire_safe() short-circuits in that window. + */ +static inline void emu_dma_fire_irq(struct emu_dma_engine *eng) +{ + if (eng->msi != NULL && eng->msi->alloc_virq != 0) + emu_msi_fire_safe(eng->msi); + else + emu_irq_fire_safe(eng->irq); +} + /* ---------------------------------------------------------------- * Polling kthread * ---------------------------------------------------------------- */ @@ -530,7 +555,7 @@ static int emu_poll_thread_fn(void *data) need_irq = true; if (need_irq) { - emu_irq_fire_safe(eng->irq); + emu_dma_fire_irq(eng); eng->irq_count++; } } @@ -592,7 +617,7 @@ static int emu_poll_thread_fn(void *data) eng->wr_ring_idx = (eng->wr_ring_idx + 1) % eng->addr_count; emu_reg_write(eng, EMU_REG_HWWRINDEX, eng->wr_ring_idx); - emu_irq_fire_safe(eng->irq); + emu_dma_fire_irq(eng); eng->irq_count++; eng->seed_pending = true; eng->seed_idx++; @@ -613,11 +638,17 @@ static int emu_poll_thread_fn(void *data) * Engine lifecycle * ---------------------------------------------------------------- */ +void emu_dma_set_msi(struct emu_dma_engine *eng, struct emu_msi *msi) +{ + eng->msi = msi; +} + int emu_dma_init(struct emu_dma_engine *eng, struct emu_bar0 *bar, struct emu_irq *irq) { eng->bar = bar; eng->irq = irq; + eng->msi = NULL; eng->reg_base = bar->uc_virt + EMU_AGEN2_OFF; eng->rd_ring = NULL; diff --git a/emulator/driver/src/dma_engine.h b/emulator/driver/src/dma_engine.h index a2111271..6cc0f0e4 100644 --- a/emulator/driver/src/dma_engine.h +++ b/emulator/driver/src/dma_engine.h @@ -32,6 +32,7 @@ #include #include "bar0_regs.h" #include "virt_irq.h" +#include "virt_msi.h" /* Maximum RX buffers in free pool (matches driver's default rxBuffers.count) */ #define EMU_FREE_POOL_MAX 4096 @@ -81,7 +82,8 @@ struct emu_free_buf { struct emu_dma_engine { /* Back-references */ struct emu_bar0 *bar; - struct emu_irq *irq; + struct emu_irq *irq; /* legacy INTx delivery (always populated) */ + struct emu_msi *msi; /* MSI/MSI-X delivery (NULL in INTx mode) */ /* Register base (bar->virt + EMU_AGEN2_OFF) */ void *reg_base; @@ -138,6 +140,7 @@ struct emu_dma_engine { int emu_dma_init(struct emu_dma_engine *eng, struct emu_bar0 *bar, struct emu_irq *irq); +void emu_dma_set_msi(struct emu_dma_engine *eng, struct emu_msi *msi); void emu_dma_start(struct emu_dma_engine *eng); void emu_dma_stop(struct emu_dma_engine *eng); void emu_dma_destroy(struct emu_dma_engine *eng); diff --git a/emulator/driver/src/emu_main.c b/emulator/driver/src/emu_main.c index 149e3ae5..3e9bc169 100644 --- a/emulator/driver/src/emu_main.c +++ b/emulator/driver/src/emu_main.c @@ -28,6 +28,7 @@ #include "bar0_regs.h" #include "virt_pci_host.h" #include "virt_irq.h" +#include "virt_msi.h" #include "dma_engine.h" #include "gpu_engine.h" /* Drain-callback registration surface from nvidia_p2p_stub. */ @@ -57,6 +58,26 @@ module_param(emu_prbs_seed, uint, 0444); MODULE_PARM_DESC(emu_prbs_seed, "PRBS seed (0 = random via get_random_u32)"); +/* + * IRQ delivery mode advertised by the virtual PCI device. Selects which + * capability block emu_pci_init_cfg_space installs in cfg_space and + * whether dev_set_msi_domain() is invoked on the enumerated pdev: + * + * "intx" (default): no caps; pcidev->irq = virq for legacy INTx delivery. + * "msi" : 32-bit MSI cap at 0x40, MSI domain attached so the kernel + * succeeds pci_alloc_irq_vectors(... PCI_IRQ_MSI). + * "msix" : MSI-X cap at 0x50, table at BAR0+0x38000 and PBA at + * BAR0+0x39000 (EMU_MSIX_TABLE_OFF / EMU_MSIX_PBA_OFF), MSI + * domain attached so pci_alloc_irq_vectors(... PCI_IRQ_MSIX) + * succeeds. + * + * tests/test_irq_modes.sh sweeps {intx, msi, msix} on every CPU-CI cell. + */ +static char *emu_irq_mode = "intx"; +module_param(emu_irq_mode, charp, 0444); +MODULE_PARM_DESC(emu_irq_mode, + "IRQ delivery mode: intx | msi | msix (default intx)"); + /* Gates a pr_info in emu_gpu_rx_tick that emits "emu_gpu: buf=%u * rx_seq=0x%08x" per RX-completed frame. Default 0 (off) so soak dmesg * is clean. Set to 1 during bring-up to observe per-buffer PRBS @@ -95,10 +116,44 @@ MODULE_PARM_DESC(emu_gpu_poll_interval_us, /* File-static global state */ static struct emu_bar0 emu_bar; static struct emu_irq emu_irq; +static struct emu_msi emu_msi; static struct emu_dma_engine emu_dma; static struct emu_pci_host emu_host; static struct emu_gpu_engine emu_gpu; +/* + * Resolved irq_mode (parsed from the emu_irq_mode modparam at init). + * Tracked separately so emu_exit() can branch teardown without re-parsing + * the string. + */ +static enum emu_irq_mode emu_irq_mode_val = EMU_IRQ_MODE_INTX; + +/** + * emu_parse_irq_mode - Map the emu_irq_mode modparam string to its enum. + * @s: modparam string (must be non-NULL) + * @out: resolved enum on success + * + * Return: 0 on success, -EINVAL on unrecognized value. + */ +static int emu_parse_irq_mode(const char *s, enum emu_irq_mode *out) +{ + if (s == NULL) + return -EINVAL; + if (!strcmp(s, "intx")) { + *out = EMU_IRQ_MODE_INTX; + return 0; + } + if (!strcmp(s, "msi")) { + *out = EMU_IRQ_MODE_MSI; + return 0; + } + if (!strcmp(s, "msix")) { + *out = EMU_IRQ_MODE_MSIX; + return 0; + } + return -EINVAL; +} + /* ---------------------------------------------------------------- * Drain callback. * @@ -135,6 +190,19 @@ static int __init emu_init(void) pr_info("%s: loading emulator module\n", EMU_MOD_NAME); + /* Validate emu_irq_mode BEFORE any allocation so a typo on the insmod + * line fails fast (-EINVAL) instead of silently coercing to a default. + * tests/test_irq_modes.sh relies on this -- a quietly-defaulted mode + * would let the test see "INTx" when it asked for "msix" and produce + * a misleading "everything passed" report. */ + ret = emu_parse_irq_mode(emu_irq_mode, &emu_irq_mode_val); + if (ret) { + pr_err("%s: invalid emu_irq_mode='%s' (expected intx|msi|msix)\n", + EMU_MOD_NAME, emu_irq_mode ? emu_irq_mode : "(null)"); + return ret; + } + pr_info("%s: irq mode = %s\n", EMU_MOD_NAME, emu_irq_mode); + /* Reject out-of-range emu_gpu_max_buffers BEFORE any allocation so * we never leak resources on a misconfigured insmod. The V4 BAR0 * MaxBuffersV4 register field is [10:0] -> upper bound 1024. Zero @@ -176,19 +244,36 @@ static int __init emu_init(void) * both track the insmod parameter. */ emu_bar0_init_regs(&emu_bar, emu_gpu_max_buffers); - /* Step 4: Create IRQ domain and virtual IRQ */ + /* Step 4: Create IRQ domain and virtual IRQ (always; legacy INTx slot + * stays populated even in MSI/MSI-X mode so cfg_space EMU_CFG_IRQ_LINE + * has a stable value and the cascade fall-through path remains valid). */ ret = emu_irq_create(&emu_irq); if (ret) { pr_err("%s: IRQ creation failed (%d)\n", EMU_MOD_NAME, ret); goto err_release_iomem; } - /* Step 5: Initialize DMA engine */ + /* Step 4b: Create the virtual PCI-MSI domain when needed. INTx mode + * skips this -- no domain is attached to the bus, no MSI cap is + * advertised, and the kernel never tries to allocate MSI vectors. */ + if (emu_irq_mode_val != EMU_IRQ_MODE_INTX) { + ret = emu_msi_create(&emu_msi); + if (ret) { + pr_err("%s: MSI domain creation failed (%d)\n", EMU_MOD_NAME, ret); + goto err_destroy_irq; + } + } + + /* Step 5: Initialize DMA engine. Wire the MSI sink so the engine fires + * the kernel-allocated child virq once datadev's pci_alloc_irq_vectors + * runs; INTx mode leaves it NULL and the engine falls back to emu_irq. */ ret = emu_dma_init(&emu_dma, &emu_bar, &emu_irq); if (ret) { pr_err("%s: DMA engine init failed (%d)\n", EMU_MOD_NAME, ret); - goto err_destroy_irq; + goto err_destroy_msi; } + if (emu_irq_mode_val != EMU_IRQ_MODE_INTX) + emu_dma_set_msi(&emu_dma, &emu_msi); /* Step 6: Start DMA engine polling BEFORE creating the PCI host bridge. * The poll thread must be running when pci_bus_add_devices() triggers @@ -237,9 +322,15 @@ static int __init emu_init(void) /* Step 7: Create PCI host bridge -- triggers bus scan and driver probe. * emu_pci_host_create() sets pcidev->irq = virq before - * pci_bus_add_devices() so the datadev driver's IRQ check at probe - * time succeeds. */ - ret = emu_pci_host_create(&emu_host, &emu_bar, emu_irq.virq); + * pci_bus_add_devices() so the datadev driver's INTx fallback path + * succeeds; for MSI/MSI-X mode it also attaches the MSI domain to the + * enumerated pdev so pci_alloc_irq_vectors() finds something to + * allocate from. */ + ret = emu_pci_host_create(&emu_host, &emu_bar, emu_irq.virq, + emu_irq_mode_val, + emu_irq_mode_val == EMU_IRQ_MODE_INTX + ? NULL + : emu_msi_get_domain(&emu_msi)); if (ret) { pr_err("%s: PCI host creation failed (%d)\n", EMU_MOD_NAME, ret); goto err_unregister_drain; @@ -278,6 +369,9 @@ static int __init emu_init(void) * target lands here because emu_dma_init failure skips the * corresponding stop step). */ emu_dma_destroy(&emu_dma); +err_destroy_msi: + if (emu_irq_mode_val != EMU_IRQ_MODE_INTX) + emu_msi_destroy(&emu_msi); err_destroy_irq: emu_irq_destroy(&emu_irq); err_release_iomem: @@ -339,12 +433,19 @@ static void __exit emu_exit(void) * any residual engine state (idempotent if stop already ran). */ emu_gpu_destroy(&emu_gpu); - /* Step 3: Destroy DMA engine -- flush and destroy workqueue after - * PCI host is gone but before IRQ domain teardown, since - * irq_work may still be pending until the workqueue is flushed */ + /* Step 3: Destroy DMA engine after the PCI host is gone. The poll + * thread (the only emu_msi_fire_safe() caller) was already stopped in + * Step 2, so no new MSI irq_work can be queued past this point. */ emu_dma_destroy(&emu_dma); - /* Step 4: Destroy IRQ domain */ + /* Step 4: Destroy MSI domain (no-op in INTx mode). emu_msi_destroy() + * runs irq_work_sync() internally to drain any fire queued just before + * the poll thread stopped, then removes the child domain before the + * parent so the kernel doesn't walk a child whose parent has gone. */ + if (emu_irq_mode_val != EMU_IRQ_MODE_INTX) + emu_msi_destroy(&emu_msi); + + /* Step 4b: Destroy parent IRQ domain */ emu_irq_destroy(&emu_irq); /* Step 5: Release BAR0 iomem window and restore the original System diff --git a/emulator/driver/src/virt_msi.c b/emulator/driver/src/virt_msi.c new file mode 100644 index 00000000..52a24436 --- /dev/null +++ b/emulator/driver/src/virt_msi.c @@ -0,0 +1,315 @@ +/** + *---------------------------------------------------------------------------- + * Company : SLAC National Accelerator Laboratory + *---------------------------------------------------------------------------- + * Description: + * Virtual PCI-MSI / MSI-X domain implementation. Provides a parent + * irq_domain backed by a software-only irq_chip and a child PCI-MSI + * domain built via pci_msi_create_irq_domain(), which the host bridge + * attaches to its bus so pci_alloc_irq_vectors() succeeds without real + * MSI hardware. + * + * Delivery model: the kernel-allocated child virq is recorded on .alloc + * so emu_msi_fire_safe() can call generic_handle_irq() directly. The + * actual MSI memory writes the kernel programs into the device's MSI + * cap (or MSI-X table) are never observed -- they land in cfg_space[] + * or in the BAR0 RAM-backed page and are simply ignored. + *---------------------------------------------------------------------------- + * This file is part of the aes_stream_drivers package. It is subject to + * the license terms in the LICENSE.txt file found in the top-level directory + * of this distribution and at: + * https://confluence.slac.stanford.edu/display/ppareg/LICENSE.html. + * No part of the aes_stream_drivers package, including this file, may be + * copied, modified, propagated, or distributed except according to the terms + * contained in the LICENSE.txt file. + *---------------------------------------------------------------------------- + **/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "virt_msi.h" + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 19, 0) + +/* + * pci_msi_create_irq_domain() was removed in Linux 6.19 (the global PCI-MSI + * domain model was replaced by per-device msi_create_parent_irq_domain()). + * The emulator's virtual MSI/MSI-X path has not been ported to that API, so + * emulated MSI/MSI-X is unsupported on these kernels. INTx mode -- the default + * and the only mode exercised by the load/test phase of CI -- is unaffected, + * and emu_main.c only calls emu_msi_create() when emu_irq_mode != intx. + */ +int emu_msi_create(struct emu_msi *m) +{ + m->alloc_virq = 0; + m->fwnode = NULL; + m->parent_domain = NULL; + m->msi_domain = NULL; + pr_warn("emu_msi: virtual PCI-MSI/MSI-X unsupported on Linux >= 6.19 " + "(pci_msi_create_irq_domain removed); use emu_irq_mode=intx\n"); + return -EOPNOTSUPP; +} + +void emu_msi_destroy(struct emu_msi *m) +{ + /* emu_msi_create() allocates nothing on this kernel; nothing to free. */ +} + +#else /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 19, 0) */ + +/* + * Software-only irq_chip for the parent domain. The kernel calls these + * during request_irq() / free_irq() and during MSI mask/unmask. None of + * them need to do anything: delivery is via generic_handle_irq() from + * emu_msi_fire_safe(), and there's no real controller to ack/mask. + */ +static void emu_msi_irq_noop(struct irq_data *d) +{ + /* nothing to do */ +} + +/* + * Software-only MSI message composer. Required by msi_domain_activate()'s + * irq_chip_compose_msi_msg() contract -- it walks up the irq hierarchy + * and BUG_ON()s if no chip in the chain implements this op. We fill with + * zeros: the kernel writes the result into the device's MSI cap (or + * MSI-X table entry), but the emulator never observes those writes -- + * delivery is via generic_handle_irq() from emu_msi_fire_safe(), not + * via the message-address/data path that real MSI uses. + */ +static void emu_msi_compose_msg(struct irq_data *d, struct msi_msg *msg) +{ + msg->address_lo = 0; + msg->address_hi = 0; + msg->data = 0; +} + +/* + * Software-only affinity setter. msi_domain_set_affinity() unconditionally + * dereferences parent->chip->irq_set_affinity during irq_startup() (called + * from request_threaded_irq()); a NULL slot crashes the kernel with + * RIP=0x0. Since delivery is via generic_handle_irq() from arbitrary CPU + * context (irq_work IPI / poll thread), affinity has no real meaning here + * -- accept any mask and return NOCOPY so the kernel does not re-write the + * irq_data's effective_mask. + */ +static int emu_msi_set_affinity(struct irq_data *d, const struct cpumask *m, + bool force) +{ + return IRQ_SET_MASK_OK_NOCOPY; +} + +static struct irq_chip emu_msi_parent_chip = { + .name = "EMU-MSI-PARENT", + .irq_ack = emu_msi_irq_noop, + .irq_mask = emu_msi_irq_noop, + .irq_unmask = emu_msi_irq_noop, + .irq_eoi = emu_msi_irq_noop, + .irq_compose_msi_msg = emu_msi_compose_msg, + .irq_set_affinity = emu_msi_set_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +/** + * emu_msi_parent_alloc - Allocate hwirqs in the parent domain. + * + * Called by the kernel's PCI-MSI infrastructure when datadev calls + * pci_alloc_irq_vectors(). For each requested vector, we associate the + * caller-supplied virq with a fresh hwirq and our software chip. + * + * hwirqs come from a recycled bitmap (m->hwirq_map) rather than a + * monotonic counter: the emulator reloads datadev many times within one + * module lifetime, and a counter would leak a hwirq per cycle and + * eventually fail pci_alloc_irq_vectors() even though every prior vector + * was freed. On the -ENOSPC path no bits are set, so a failed attempt + * leaves the pool untouched. + * + * The most recently allocated child virq is stashed in the + * emu_msi state (passed via host_data) so the DMA engine can fire it + * without snooping the bound pci_dev's resources. + */ +static int emu_msi_parent_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct emu_msi *m = d->host_data; + unsigned int i; + unsigned long flags; + unsigned long hw_base; + + if (m == NULL) + return -EINVAL; + + spin_lock_irqsave(&m->hwirq_lock, flags); + hw_base = bitmap_find_next_zero_area(m->hwirq_map, EMU_MSI_HWIRQ_COUNT, + 0, nr_irqs, 0); + if (hw_base >= EMU_MSI_HWIRQ_COUNT) { + spin_unlock_irqrestore(&m->hwirq_lock, flags); + pr_err("emu_msi: hwirq pool exhausted (req=%u cap=%d)\n", + nr_irqs, EMU_MSI_HWIRQ_COUNT); + return -ENOSPC; + } + bitmap_set(m->hwirq_map, hw_base, nr_irqs); + spin_unlock_irqrestore(&m->hwirq_lock, flags); + + for (i = 0; i < nr_irqs; i++) { + irq_domain_set_info(d, virq + i, hw_base + i, + &emu_msi_parent_chip, + NULL, /* chip_data */ + handle_simple_irq, + NULL, NULL); + } + + /* Track the first allocated virq for the DMA engine fire path. The + * datadev driver always allocates a single vector, so virq is the + * one that ends up in pdev->irq via pci_irq_vector(pdev, 0). */ + if (m != NULL) + m->alloc_virq = virq; + + return 0; +} + +static void emu_msi_parent_free(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs) +{ + struct emu_msi *m = d->host_data; + struct irq_data *irqd = irq_get_irq_data(virq); + unsigned long flags; + unsigned int i; + + /* Return the contiguous hwirq run to the pool. The run is laid out + * hw_base..hw_base+nr_irqs by alloc, so the first virq's hwirq is the + * base. Recover it before irq_domain_reset_irq_data() clears it. */ + if (m != NULL && irqd != NULL) { + spin_lock_irqsave(&m->hwirq_lock, flags); + bitmap_clear(m->hwirq_map, irqd->hwirq, nr_irqs); + spin_unlock_irqrestore(&m->hwirq_lock, flags); + } + + for (i = 0; i < nr_irqs; i++) + irq_domain_reset_irq_data(irq_get_irq_data(virq + i)); + + if (m != NULL && m->alloc_virq >= virq && m->alloc_virq < virq + nr_irqs) + m->alloc_virq = 0; +} + +static const struct irq_domain_ops emu_msi_parent_ops = { + .alloc = emu_msi_parent_alloc, + .free = emu_msi_parent_free, +}; + +/* + * msi_domain_info passed to pci_msi_create_irq_domain. The default ops + * supplied by the kernel handle the MSI mask/unmask plumbing on top of + * our parent chip; MSI_FLAG_PCI_MSIX additionally allows MSI-X mode. + */ +static struct irq_chip emu_msi_chip = { + .name = "EMU-MSI", +}; + +static struct msi_domain_info emu_msi_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | + MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX, + .chip = &emu_msi_chip, +}; + +static void emu_msi_work_fn(struct irq_work *work) +{ + struct emu_msi *m = container_of(work, struct emu_msi, irq_work); + + if (m->alloc_virq != 0) + generic_handle_irq(m->alloc_virq); +} + +int emu_msi_create(struct emu_msi *m) +{ + m->alloc_virq = 0; + spin_lock_init(&m->hwirq_lock); + bitmap_zero(m->hwirq_map, EMU_MSI_HWIRQ_COUNT); + + m->fwnode = irq_domain_alloc_named_fwnode("emu-msi"); + if (m->fwnode == NULL) { + pr_err("emu_msi: irq_domain_alloc_named_fwnode failed\n"); + return -ENOMEM; + } + + m->parent_domain = irq_domain_create_linear(m->fwnode, + EMU_MSI_HWIRQ_COUNT, + &emu_msi_parent_ops, m); + if (m->parent_domain == NULL) { + pr_err("emu_msi: parent irq_domain_create_linear failed\n"); + irq_domain_free_fwnode(m->fwnode); + m->fwnode = NULL; + return -ENOMEM; + } + /* IRQ_DOMAIN_FLAG_MSI_PARENT is a 6.0+-only flag and is informational + * for hierarchical multi-bus MSI controllers. A simple software parent + * directly underneath pci_msi_create_irq_domain doesn't need it; omit + * for cross-kernel-version portability across the CI matrix. */ + + m->msi_domain = pci_msi_create_irq_domain(m->fwnode, + &emu_msi_domain_info, + m->parent_domain); + if (m->msi_domain == NULL) { + pr_err("emu_msi: pci_msi_create_irq_domain failed\n"); + irq_domain_remove(m->parent_domain); + m->parent_domain = NULL; + irq_domain_free_fwnode(m->fwnode); + m->fwnode = NULL; + return -ENOMEM; + } + + init_irq_work(&m->irq_work, emu_msi_work_fn); + + pr_info("emu_msi: PCI-MSI domain ready (parent=%p child=%p)\n", + m->parent_domain, m->msi_domain); + return 0; +} + +void emu_msi_destroy(struct emu_msi *m) +{ + /* Stop new fires, then wait for any irq_work queued by a last-gasp + * emu_msi_fire_safe() to finish running. Without this the deferred + * emu_msi_work_fn() could execute after the domains are torn down -- + * or after the module text is freed -- and call generic_handle_irq() + * on a stale virq. Clearing alloc_virq first makes any already-queued + * work a no-op; irq_work_sync() then guarantees it has returned. + * Safe on partially-initialized state: a zero-initialized irq_work + * has no pending flags, so irq_work_sync() returns immediately. */ + m->alloc_virq = 0; + irq_work_sync(&m->irq_work); + + if (m->msi_domain != NULL) { + irq_domain_remove(m->msi_domain); + m->msi_domain = NULL; + } + if (m->parent_domain != NULL) { + irq_domain_remove(m->parent_domain); + m->parent_domain = NULL; + } + if (m->fwnode != NULL) { + irq_domain_free_fwnode(m->fwnode); + m->fwnode = NULL; + } +} + +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6, 19, 0) */ + +struct irq_domain *emu_msi_get_domain(const struct emu_msi *m) +{ + return m->msi_domain; +} + +void emu_msi_fire_safe(struct emu_msi *m) +{ + if (m->alloc_virq != 0) + irq_work_queue(&m->irq_work); +} diff --git a/emulator/driver/src/virt_msi.h b/emulator/driver/src/virt_msi.h new file mode 100644 index 00000000..2a7aeb80 --- /dev/null +++ b/emulator/driver/src/virt_msi.h @@ -0,0 +1,119 @@ +/** + *---------------------------------------------------------------------------- + * Company : SLAC National Accelerator Laboratory + *---------------------------------------------------------------------------- + * Description: + * Virtual PCI-MSI / MSI-X domain plumbing for the FPGA emulator. + * + * Real PCIe MSI delivery on a software-only virtual host bridge requires + * that the bridge expose an MSI irq_domain so the kernel's + * pci_alloc_irq_vectors() path actually returns a usable Linux IRQ + * instead of failing back to legacy INTx. This file builds two domains: + * + * parent (linear, 16 hwirqs) + * -> child PCI-MSI domain (built by pci_msi_create_irq_domain) + * + * The parent's .alloc op allocates a fresh hwirq, wires it to a + * software-only irq_chip whose ack/mask/unmask are no-ops (delivery + * happens via generic_handle_irq() from emu_msi_fire_safe(), not via a + * real interrupt controller). The child PCI-MSI domain wraps the parent + * using the kernel's default MSI domain/chip ops. + * + * After pci_scan_root_bus_bridge() enumerates the virtual device, the + * host bridge attaches the child domain to the pci_dev via + * dev_set_msi_domain() -- before pci_bus_add_devices() triggers driver + * probe -- so when the datadev driver calls pci_alloc_irq_vectors(... + * PCI_IRQ_MSIX | PCI_IRQ_MSI | PCI_IRQ_INTX) the kernel finds an MSI + * domain to allocate from. + *---------------------------------------------------------------------------- + * This file is part of the aes_stream_drivers package. It is subject to + * the license terms in the LICENSE.txt file found in the top-level directory + * of this distribution and at: + * https://confluence.slac.stanford.edu/display/ppareg/LICENSE.html. + * No part of the aes_stream_drivers package, including this file, may be + * copied, modified, propagated, or distributed except according to the terms + * contained in the LICENSE.txt file. + *---------------------------------------------------------------------------- + **/ +#ifndef __EMU_VIRT_MSI_H__ +#define __EMU_VIRT_MSI_H__ + +#include +#include +#include +#include + +/* + * Maximum hwirqs the virtual MSI parent can hand out. The datadev driver + * only ever requests 1 vector via pci_alloc_irq_vectors(... 1, 1, ...), + * so 16 is generous future-proofing without bloating the linear domain. + * hwirqs are recycled on .free (see hwirq_map below), so this bounds the + * number of vectors live at once, not the number of load/unload cycles. + */ +#define EMU_MSI_HWIRQ_COUNT 16 + +/** + * struct emu_msi - Virtual PCI-MSI domain state + * @fwnode: synthesized fwnode_handle owned by this domain + * @parent_domain: linear hwirq->virq parent + * @msi_domain: PCI-MSI child domain (the one the kernel binds to a bus) + * @alloc_virq: most recently allocated child virq (used by emu_msi_fire) + * @irq_work: deferred fire helper, runs generic_handle_irq() in + * hardirq context via self-IPI + * @hwirq_lock: protects hwirq_map against concurrent alloc/free + * @hwirq_map: bitmap of in-use parent hwirqs; recycled on .free so + * repeated datadev reloads cannot exhaust the pool + */ +struct emu_msi { + struct fwnode_handle *fwnode; + struct irq_domain *parent_domain; + struct irq_domain *msi_domain; + unsigned int alloc_virq; + struct irq_work irq_work; + spinlock_t hwirq_lock; + DECLARE_BITMAP(hwirq_map, EMU_MSI_HWIRQ_COUNT); +}; + +/** + * emu_msi_create - Build the virtual PCI-MSI parent + child domains. + * @m: caller-allocated state structure (zero-initialized) + * + * Both domains are built unconditionally; the caller decides at host-bridge + * creation time whether to attach the resulting MSI domain to the bus + * (skip for "intx" mode, attach for "msi"/"msix" modes). + * + * Return: 0 on success, negative errno on failure + */ +int emu_msi_create(struct emu_msi *m); + +/** + * emu_msi_destroy - Tear down the virtual PCI-MSI domains. + * @m: state structure + * + * Idempotent on partially-initialized state (NULL pointer checks throughout). + */ +void emu_msi_destroy(struct emu_msi *m); + +/** + * emu_msi_get_domain - Return the PCI-MSI child domain. + * @m: state structure + * + * Caller passes this to the host bridge so the kernel can allocate + * MSI/MSI-X vectors against it during pci_alloc_irq_vectors(). + * + * Return: irq_domain pointer (or NULL if create() failed) + */ +struct irq_domain *emu_msi_get_domain(const struct emu_msi *m); + +/** + * emu_msi_fire_safe - Trigger the most recently allocated MSI virq. + * @m: state structure + * + * Mirrors emu_irq_fire_safe (legacy INTx path) but targets the kernel- + * allocated child virq instead of the linear parent. Safe to call from + * any context; uses irq_work to defer the generic_handle_irq() call to + * hardirq context. + */ +void emu_msi_fire_safe(struct emu_msi *m); + +#endif /* __EMU_VIRT_MSI_H__ */ diff --git a/emulator/driver/src/virt_pci_host.c b/emulator/driver/src/virt_pci_host.c index e2918fc9..9826aa91 100644 --- a/emulator/driver/src/virt_pci_host.c +++ b/emulator/driver/src/virt_pci_host.c @@ -73,8 +73,9 @@ static void emu_pci_init_cfg_space(struct emu_pci_host *host) /* Command = 0x0000 (driver sets bus master + mem space enable) */ cfg_write16(cfg, EMU_CFG_COMMAND, 0x0000); - /* Status = 0x0010 (capabilities list present) */ - cfg_write16(cfg, EMU_CFG_STATUS, 0x0010); + /* Status: the Capabilities List bit (0x0010) is set per-irq_mode below. + * INTx-only advertises no capabilities, so it stays cleared in that case. */ + cfg_write16(cfg, EMU_CFG_STATUS, 0x0000); /* Revision = 0x01 */ cfg[EMU_CFG_REVISION] = 0x01; @@ -111,8 +112,59 @@ static void emu_pci_init_cfg_space(struct emu_pci_host *host) /* IRQ line = virtual IRQ number */ cfg[EMU_CFG_IRQ_LINE] = (uint8_t)(host->virq & 0xFF); - /* IRQ pin = INTA# */ + /* IRQ pin = INTA#. Always assert that the function has an INTx pin so + * pci_alloc_irq_vectors(... PCI_IRQ_INTX) succeeds even when MSI/MSI-X + * are also advertised -- the cascade picks the highest-priority kind + * that's actually advertised, but it's spec-correct for a function to + * report all three. This also models legacy PCIe IP cores that expose + * INTx alongside MSI/MSI-X in the same config space. */ cfg[EMU_CFG_IRQ_PIN] = 0x01; + + /* + * Capability list. Pick a single capability to advertise based on + * irq_mode and set the Status "Capabilities List" bit (0x0010) to match + * -- the kernel's pci_alloc_irq_vectors cascade then negotiates the + * matching path. INTx-only advertises no capability and leaves the bit + * clear so the config space stays internally consistent. + */ + switch (host->irq_mode) { + case EMU_IRQ_MODE_MSIX: { + /* 12-byte MSI-X capability at EMU_CFG_MSIX_CAP */ + uint16_t mc = 0x0000; /* Table Size = 0 (means 1 vector); Enable=0 */ + uint32_t table_bir = EMU_MSIX_TABLE_OFF | 0x0; /* BIR=0 (BAR0) */ + uint32_t pba_bir = EMU_MSIX_PBA_OFF | 0x0; /* BIR=0 (BAR0) */ + + cfg_write16(cfg, EMU_CFG_STATUS, 0x0010); + cfg[EMU_CFG_CAP_PTR] = EMU_CFG_MSIX_CAP; + cfg[EMU_CFG_MSIX_CAP + 0] = 0x11; /* Cap ID = MSI-X */ + cfg[EMU_CFG_MSIX_CAP + 1] = 0x00; /* Next = end */ + cfg_write16(cfg, EMU_CFG_MSIX_CAP + 2, mc); + cfg_write32(cfg, EMU_CFG_MSIX_CAP + 4, table_bir); + cfg_write32(cfg, EMU_CFG_MSIX_CAP + 8, pba_bir); + break; + } + case EMU_IRQ_MODE_MSI: { + /* 10-byte 32-bit MSI capability at EMU_CFG_MSI_CAP. MMC=0 means a + * single message vector is supported; 64-bit address bit (MC[7]) is + * zero so the kernel writes a 32-bit Message Address only. */ + uint16_t mc = 0x0000; /* MMC=0, MME=0, no 64-bit, no per-vec mask */ + + cfg_write16(cfg, EMU_CFG_STATUS, 0x0010); + cfg[EMU_CFG_CAP_PTR] = EMU_CFG_MSI_CAP; + cfg[EMU_CFG_MSI_CAP + 0] = 0x05; /* Cap ID = MSI */ + cfg[EMU_CFG_MSI_CAP + 1] = 0x00; /* Next = end */ + cfg_write16(cfg, EMU_CFG_MSI_CAP + 2, mc); + /* Bytes 4-7: Message Address (kernel writes during enable, RW) */ + /* Bytes 8-9: Message Data (kernel writes during enable, RW) */ + break; + } + case EMU_IRQ_MODE_INTX: + default: + /* No capability list -- legacy INTx only, matches pre-MSI behavior. + * Status "Capabilities List" bit stays clear (set to 0x0000 above). */ + cfg[EMU_CFG_CAP_PTR] = 0x00; + break; + } } /** @@ -301,16 +353,20 @@ static struct notifier_block emu_pci_nb = { */ int emu_pci_host_create(struct emu_pci_host *host, struct emu_bar0 *bar, - unsigned int virq) + unsigned int virq, + enum emu_irq_mode irq_mode, + struct irq_domain *msi_domain) { struct pci_dev *pdev; int ret; - /* Store BAR0 and IRQ references */ - host->bar = bar; - host->virq = virq; + /* Store BAR0, IRQ, and mode references */ + host->bar = bar; + host->virq = virq; + host->irq_mode = irq_mode; + host->msi_domain = msi_domain; - /* Populate PCI config space with static values */ + /* Populate PCI config space with static values (caps gated on irq_mode) */ emu_pci_init_cfg_space(host); /* Initialize the memory resource for BAR0 */ @@ -355,11 +411,12 @@ int emu_pci_host_create(struct emu_pci_host *host, pci_bus_claim_resources(host->bus); /* - * Fix pci_dev->irq: The PCI subsystem does NOT populate pcidev->irq + * Set pci_dev->irq: the PCI subsystem does NOT populate pcidev->irq * from config space byte 0x3C for virtual host bridges. The datadev - * driver checks pcidev->irq != 0 at data_dev_top.c:270 and fails - * probe if it is 0. Set the IRQ on all enumerated devices BEFORE - * pci_bus_add_devices() triggers the driver probe path. + * driver's pci_alloc_irq_vectors(... PCI_IRQ_INTX) fall-through hands + * back this value via pci_irq_vector(pdev, 0), so it must be a valid + * legacy IRQ for the INTx branch of the cascade to work. Set it on all + * enumerated devices BEFORE pci_bus_add_devices() triggers driver probe. */ for_each_pci_dev(pdev) { if (pdev->bus == host->bus) { @@ -370,6 +427,20 @@ int emu_pci_host_create(struct emu_pci_host *host, pdev->dev.cma_area = NULL; #endif set_dev_node(&pdev->dev, NUMA_NO_NODE); + + /* Attach the virtual PCI-MSI domain so the datadev driver's + * pci_alloc_irq_vectors(... PCI_IRQ_MSIX | PCI_IRQ_MSI | ...) + * call finds something to allocate from. INTX mode leaves the + * domain unset; the kernel skips MSI/MSI-X (no caps in cfg + * space) and lands on legacy INTx using pdev->irq above. + */ + if (host->irq_mode != EMU_IRQ_MODE_INTX && host->msi_domain) { + dev_set_msi_domain(&pdev->dev, host->msi_domain); + pr_info("emu: attached MSI domain to %s (mode=%s)\n", + pci_name(pdev), + host->irq_mode == EMU_IRQ_MODE_MSIX ? "msix" : "msi"); + } + pr_info("emu: set pci_dev %s irq to %u, DMA mask 64-bit\n", pci_name(pdev), virq); } diff --git a/emulator/driver/src/virt_pci_host.h b/emulator/driver/src/virt_pci_host.h index 732d3ca9..e77e4f44 100644 --- a/emulator/driver/src/virt_pci_host.h +++ b/emulator/driver/src/virt_pci_host.h @@ -48,29 +48,65 @@ #define EMU_CFG_IRQ_LINE 0x3C #define EMU_CFG_IRQ_PIN 0x3D +/* + * Capability block offsets within cfg_space. Both layouts fit comfortably + * inside the 256-byte standard config space. Only one cap is installed at + * a time -- emu_pci_init_cfg_space picks based on irq_mode. + */ +#define EMU_CFG_MSI_CAP 0x40 /* 10-byte 32-bit MSI capability */ +#define EMU_CFG_MSIX_CAP 0x50 /* 12-byte MSI-X capability */ + +/* + * MSI-X table + PBA placement inside BAR0. These must live inside the + * guaranteed-minimum BAR0 allocation: emu_bar0_alloc() may fall back as + * far as EMU_BAR0_MIN_ORDER (256 KB) under host memory fragmentation, and + * since MSI-X is advertised as the only capability, a table offset past + * the actual BAR length would make pci_alloc_irq_vectors(...MSIX...) fail + * with no MSI fall-back in cfg_space. The emulated register regions top + * out at 0x30000 (GPU_ASYNC end); 0x38000/0x39000 sit above them, in + * separate 4 KB pages, and below the 0x40000 minimum BAR size. + */ +#define EMU_MSIX_TABLE_OFF 0x00038000 +#define EMU_MSIX_PBA_OFF 0x00039000 + +/* IRQ mode selector for the virtual device */ +enum emu_irq_mode { + EMU_IRQ_MODE_INTX = 0, + EMU_IRQ_MODE_MSI = 1, + EMU_IRQ_MODE_MSIX = 2, +}; + /** * struct emu_pci_host - Virtual PCI host bridge state - * @bridge: PCI host bridge allocated by pci_alloc_host_bridge() - * @bus: root PCI bus created during bus scan + * @bridge: PCI host bridge allocated by pci_alloc_host_bridge() + * @bus: root PCI bus created during bus scan * @cfg_space: emulated PCI configuration space (256 bytes) - * @bar: pointer to BAR0 memory allocation state - * @virq: virtual IRQ number for the emulated device - * @mem_res: memory resource describing BAR0 region + * @bar: pointer to BAR0 memory allocation state + * @virq: virtual IRQ number for the emulated device (legacy INTx slot) + * @mem_res: memory resource describing BAR0 region + * @irq_mode: which IRQ delivery mode the cfg space advertises + * @msi_domain: PCI-MSI domain attached to enumerated devices when irq_mode != INTX */ struct emu_pci_host { struct pci_host_bridge *bridge; struct pci_bus *bus; uint8_t cfg_space[EMU_PCI_CFG_SIZE]; - struct emu_bar0 *bar; /* pointer to BAR0 data */ - unsigned int virq; /* virtual IRQ number */ - struct resource mem_res; /* memory resource for BAR0 */ + struct emu_bar0 *bar; + unsigned int virq; + struct resource mem_res; + enum emu_irq_mode irq_mode; + struct irq_domain *msi_domain; }; /** * emu_pci_host_create - Create virtual PCI host bridge and scan bus - * @host: host bridge state structure (caller-allocated) - * @bar: initialized BAR0 memory allocation - * @virq: virtual IRQ number to assign to the emulated device + * @host: host bridge state structure (caller-allocated) + * @bar: initialized BAR0 memory allocation + * @virq: virtual IRQ number to assign to the emulated device + * (used for legacy INTx; ignored once MSI/MSI-X is negotiated) + * @irq_mode: which IRQ kind to advertise in cfg_space + * @msi_domain: PCI-MSI irq_domain to attach to enumerated devices when + * irq_mode is MSI or MSI-X; ignored for INTX * * Creates a PCI host bridge with custom pci_ops that emulate config space * for a device matching the datadev driver's PCI ID table. Scanning the @@ -80,7 +116,9 @@ struct emu_pci_host { */ int emu_pci_host_create(struct emu_pci_host *host, struct emu_bar0 *bar, - unsigned int virq); + unsigned int virq, + enum emu_irq_mode irq_mode, + struct irq_domain *msi_domain); /** * emu_pci_host_destroy - Tear down the virtual PCI host bridge diff --git a/tests/test_irq_modes.sh b/tests/test_irq_modes.sh index 9ccd7828..7f893700 100755 --- a/tests/test_irq_modes.sh +++ b/tests/test_irq_modes.sh @@ -34,6 +34,7 @@ set -uo pipefail DEV="${DEV:-/dev/datadev_0}" APP_BIN="${APP_BIN:-data_dev/app/bin}" DATADEV_KO="${DATADEV_KO:-data_dev/driver/datadev.ko}" +EMULATOR_KO="${EMULATOR_KO:-emulator/driver/datadev_emulator.ko}" SIZE="${SIZE:-32768}" TIMEOUT_SEC="${TIMEOUT_SEC:-15}" INSMOD_TIMEOUT_SEC="${INSMOD_TIMEOUT_SEC:-120}" @@ -64,11 +65,16 @@ echo "=== IRQ behavior sweep test ===" # run_irq_cycle LABEL [insmod_params...] # Reloads datadev with the given parameters, runs a short dmaLoopTest, checks -# PRBS and dmesg. Sets CYCLE_FAIL=1 on any error. +# PRBS and dmesg. Sets CYCLE_FAIL=1 on any error. Also stashes the most +# recent "Probe: using interrupts" line in CYCLE_PROBE_LINE so the +# outer mode sweep can verify the cascade selection without doing an extra +# insmod (which would compound DMA-buffer allocation pressure across the +# 3-mode sweep and trip the cell's later load/unload tests with -ENOMEM). run_irq_cycle() { local label="$1" shift local insmod_params="$*" + CYCLE_PROBE_LINE="" echo "--- IRQ sub-test: $label ---" DMESG_BEFORE=$($SUDO dmesg | wc -l) @@ -97,6 +103,15 @@ run_irq_cycle() { fi $SUDO chmod 666 "$DEV" + # Capture the "Probe: using interrupts" line BEFORE dmaLoopTest + # runs with cfgDebug=1 -- the per-frame Process/MapReturn lines flood + # dmesg fast enough to evict the probe line out of any reasonable tail + # window within ~5 seconds. The outer mode-sweep reads this back via + # CYCLE_PROBE_LINE. + CYCLE_PROBE_LINE=$($SUDO dmesg | tail -n 80 | \ + grep -oE 'Probe: using [A-Za-z0-9-]+([ -]+[A-Za-z]+)* interrupts' | \ + tail -1 || echo "") + sleep 2 # Run dmaLoopTest; let it run until timeout (rc=124 ok). @@ -176,13 +191,105 @@ run_irq_subtest() { return 1 } -run_irq_subtest "cfgIrqHold=1" cfgTxCount=64 cfgRxCount=64 cfgSize=65536 cfgIrqHold=1 cfgDebug=1 +# ---------------------------------------------------------------------------- +# Outer sweep: emu_irq_mode in {intx, msi, msix}. +# +# For each emulator IRQ mode, reload datadev_emulator with the requested mode +# (so cfg_space advertises the matching capability), then run the existing +# three cfgIrqHold/polled subtests against that emulator. After the +# subtests, scrape dmesg to assert datadev's probe selected the correct +# cascade branch -- a regression in data_dev_top.c's pci_alloc_irq_vectors +# call would silently land on a different branch and pass the cfg subtests +# but fail the dmesg gate below. +# +# Hosts without a built emulator (EMULATOR_KO missing) fall back to the +# legacy single-pass behavior against whichever emulator is already loaded. +# This keeps the script useful for hardware-attached test rigs where the +# real datadev hardware can't be hot-swapped between modes. +# ---------------------------------------------------------------------------- + +run_three_irq_subtests() { + local mode_label="$1" + run_irq_subtest "${mode_label}/cfgIrqHold=1" \ + cfgTxCount=64 cfgRxCount=64 cfgSize=65536 cfgIrqHold=1 cfgDebug=1 + run_irq_subtest "${mode_label}/cfgIrqHold=100000" \ + cfgTxCount=64 cfgRxCount=64 cfgSize=65536 cfgIrqHold=100000 cfgDebug=1 + run_irq_subtest "${mode_label}/polled" \ + cfgTxCount=64 cfgRxCount=64 cfgSize=65536 cfgIrqDis=1 cfgDebug=1 +} + +# Map an emu_irq_mode value to the dmesg substring data_dev_top.c logs at +# probe time. Used by the per-mode dmesg gate. +expected_probe_line() { + case "$1" in + intx) printf 'using legacy INTx interrupts' ;; + msi) printf 'using MSI interrupts' ;; + msix) printf 'using MSI-X interrupts' ;; + *) printf 'using ?' ;; + esac +} -# Sub-test 2: heavy IRQ coalescing (cfgIrqHold=100000). -run_irq_subtest "cfgIrqHold=100000" cfgTxCount=64 cfgRxCount=64 cfgSize=65536 cfgIrqHold=100000 cfgDebug=1 +if [ "${GPU_ENABLED:-0}" = "1" ]; then + # GPU CI phase: skip the IRQ-mode sweep. The GPU phase exists to exercise + # the GPU-async path; reloading the emulator three times to flip MSI/MSI-X + # caps adds ~3x runtime to that already-long phase without exercising any + # GPU-specific code path. The CPU phase runs the full sweep on the same + # emulator binary, so MSI/MSI-X coverage is preserved. + echo "=== GPU_ENABLED=1; running legacy single-mode IRQ test (mode sweep is CPU-phase-only) ===" + run_three_irq_subtests "current" +elif [ -f "$EMULATOR_KO" ]; then + echo "=== Emulator detected ($EMULATOR_KO); sweeping emu_irq_mode ===" + for IRQ_MODE in intx msi msix; do + echo "============================================================" + echo " emu_irq_mode=${IRQ_MODE}" + echo "============================================================" -# Sub-test 3: polled mode (cfgIrqDis=1). -run_irq_subtest "polled" cfgTxCount=64 cfgRxCount=64 cfgSize=65536 cfgIrqDis=1 cfgDebug=1 + # Reload emulator with the requested mode. The stub stays loaded + # because the emulator depends on its symbols. + $SUDO rmmod datadev 2>/dev/null || true + $SUDO rmmod datadev_emulator 2>/dev/null || true + for _ in $(seq 1 15); do [ ! -e "$DEV" ] && break; sleep 0.5; done + + if ! timeout --kill-after=5s "${INSMOD_TIMEOUT_SEC}s" \ + $SUDO insmod "$EMULATOR_KO" emu_irq_mode="$IRQ_MODE"; then + echo "[FAIL] could not load emulator with emu_irq_mode=$IRQ_MODE" + FAILED=$((FAILED + 1)) + continue + fi + + # Run the existing three subtests; they reload datadev each cycle + # and (via run_irq_cycle) stash the post-probe dmesg snapshot in + # CYCLE_PROBE_LINE before dmaLoopTest spam can evict it. + FAIL_BEFORE=$FAILED + run_three_irq_subtests "$IRQ_MODE" + + # Mode-selection assertion. Use the probe line captured during the + # last inner subtest (dev_info renders as "datadev 0000:bb:dd.f: + # Init: Probe: using interrupts, irq=N"; we stripped to just + # the "Probe: using interrupts" middle phrase). + # + # Only run this gate when all three subtests passed. If a subtest + # failed it has already been counted, and CYCLE_PROBE_LINE may be + # empty/stale from an aborted cycle -- asserting on it here would + # double-count the failure with a misleading "probe line mismatch". + # When the subtests pass, this still catches a wrong-branch cascade + # (dataplane OK but datadev picked the wrong interrupt kind). + EXPECTED=$(expected_probe_line "$IRQ_MODE") + if [ "$FAILED" -ne "$FAIL_BEFORE" ]; then + echo "[SKIP] irq_modes ${IRQ_MODE} -- probe-line assertion skipped (a subtest already failed this mode)" + elif [ "$CYCLE_PROBE_LINE" = "Probe: ${EXPECTED}" ]; then + echo "[PASS] irq_modes ${IRQ_MODE} -- datadev selected: ${EXPECTED}" + else + echo "[FAIL] irq_modes ${IRQ_MODE} -- captured probe line mismatch:" + echo " expected: 'Probe: ${EXPECTED}'" + echo " captured: '${CYCLE_PROBE_LINE}'" + FAILED=$((FAILED + 1)) + fi + done +else + echo "=== Emulator not found ($EMULATOR_KO); legacy single-mode pass ===" + run_three_irq_subtests "current" +fi # --- Cleanup: restore default parameters --- $SUDO rmmod datadev 2>/dev/null || true