Skip to content

Commit

Permalink
Merge tag 'vfio-v6.14-rc1' of https://github.com/awilliam/linux-vfio
Browse files Browse the repository at this point in the history
Pull vfio updates from Alex Williamson:

 - Extend vfio-pci 8-byte read/write support to include archs defining
   CONFIG_GENERIC_IOMAP, such as x86, and remove now extraneous #ifdefs
   around 64-bit accessors (Ramesh Thomas)

 - Update vfio-pci shadow ROM handling and allow cached ROM from setup
   data to be exposed as a functional ROM BAR region when available
   (Yunxiang Li)

 - Update nvgrace-gpu vfio-pci variant driver for new Grace Blackwell
   hardware, conditionalizing the uncached BAR workaround for previous
   generation hardware based on the presence of a flag in a new DVSEC
   capability, and include a delay during probe for link training to
   complete, a new requirement for GB devices (Ankit Agrawal)

* tag 'vfio-v6.14-rc1' of https://github.com/awilliam/linux-vfio:
  vfio/nvgrace-gpu: Add GB200 SKU to the devid table
  vfio/nvgrace-gpu: Check the HBM training and C2C link status
  vfio/nvgrace-gpu: Expose the blackwell device PF BAR1 to the VM
  vfio/nvgrace-gpu: Read dvsec register to determine need for uncached resmem
  vfio/platform: check the bounds of read/write syscalls
  vfio/pci: Expose setup ROM at ROM bar when needed
  vfio/pci: Remove shadow ROM specific code paths
  vfio/pci: Remove #ifdef iowrite64 and #ifdef ioread64
  vfio/pci: Enable iowrite64 and ioread64 for vfio pci
  • Loading branch information
torvalds committed Jan 28, 2025
2 parents 2ab002c + 2bb4475 commit 3673f5b
Show file tree
Hide file tree
Showing 5 changed files with 196 additions and 69 deletions.
169 changes: 147 additions & 22 deletions drivers/vfio/pci/nvgrace-gpu/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include <linux/sizes.h>
#include <linux/vfio_pci_core.h>
#include <linux/delay.h>
#include <linux/jiffies.h>

/*
* The device memory usable to the workloads running in the VM is cached
Expand All @@ -17,12 +19,21 @@
#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX

/* Memory size expected as non cached and reserved by the VM driver */
#define RESMEM_SIZE SZ_1G

/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
#define MEMBLK_SIZE SZ_512M

/* Offset from the start of the DVSEC capability to the 16-bit bitmap word */
#define DVSEC_BITMAP_OFFSET 0xA
/* Bitmap bit 0: when set, the device is treated as not having the MIG bug */
#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)

/* DVSEC ID used with pci_find_dvsec_capability() to locate the capability */
#define GPU_CAP_DVSEC_REGISTER 3

/* BAR0 register offsets polled during probe for C2C link and HBM training */
#define C2C_LINK_BAR0_OFFSET 0x1498
#define HBM_TRAINING_BAR0_OFFSET 0x200BC
/* Value both status registers must read back for the device to be ready */
#define STATUS_READY 0xFF

/* Sleep between successive polls and the overall polling budget (30s) */
#define POLL_QUANTUM_MS 1000
#define POLL_TIMEOUT_MS (30 * 1000)

/*
* The state of the two device memory region - resmem and usemem - is
* saved as struct mem_region.
Expand All @@ -46,6 +57,7 @@ struct nvgrace_gpu_pci_core_device {
struct mem_region resmem;
/* Lock to control device memory kernel mapping */
struct mutex remap_lock;
bool has_mig_hw_bug;
};

static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
Expand All @@ -66,7 +78,7 @@ nvgrace_gpu_memregion(int index,
if (index == USEMEM_REGION_INDEX)
return &nvdev->usemem;

if (index == RESMEM_REGION_INDEX)
if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
return &nvdev->resmem;

return NULL;
Expand Down Expand Up @@ -751,40 +763,67 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
u64 memphys, u64 memlength)
{
int ret = 0;
u64 resmem_size = 0;

/*
* The VM GPU device driver needs a non-cacheable region to support
* the MIG feature. Since the device memory is mapped as NORMAL cached,
* carve out a region from the end with a different NORMAL_NC
* property (called as reserved memory and represented as resmem). This
* region then is exposed as a 64b BAR (region 2 and 3) to the VM, while
* exposing the rest (termed as usable memory and represented using usemem)
* as cacheable 64b BAR (region 4 and 5).
* On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
* region to support the MIG feature owing to a hardware bug. Since the
* device memory is mapped as NORMAL cached, carve out a region from the end
* with a different NORMAL_NC property (called as reserved memory and
* represented as resmem). This region then is exposed as a 64b BAR
* (region 2 and 3) to the VM, while exposing the rest (termed as usable
* memory and represented using usemem) as cacheable 64b BAR (region 4 and 5).
*
* devmem (memlength)
* |-------------------------------------------------|
* | |
* usemem.memphys resmem.memphys
*
* This hardware bug is fixed on the Grace Blackwell platforms and the
* presence of the bug can be determined through nvdev->has_mig_hw_bug.
* Thus on systems with the hardware fix, there is no need to partition
* the GPU device memory and the entire memory is usable and mapped as
* NORMAL cached (i.e. resmem size is 0).
*/
if (nvdev->has_mig_hw_bug)
resmem_size = SZ_1G;

nvdev->usemem.memphys = memphys;

/*
* The device memory exposed to the VM is added to the kernel by the
* VM driver module in chunks of memory block size. Only the usable
* memory (usemem) is added to the kernel for usage by the VM
* workloads. Make the usable memory size memblock aligned.
* VM driver module in chunks of memory block size. Note that only the
* usable memory (usemem) is added to the kernel for usage by the VM
* workloads.
*/
if (check_sub_overflow(memlength, RESMEM_SIZE,
if (check_sub_overflow(memlength, resmem_size,
&nvdev->usemem.memlength)) {
ret = -EOVERFLOW;
goto done;
}

/*
* The USEMEM part of the device memory has to be MEMBLK_SIZE
* aligned. This is a hardwired ABI value between the GPU FW and
* VFIO driver. The VM device driver is also aware of it and make
* use of the value for its calculation to determine USEMEM size.
* The usemem region is exposed as a 64b BAR composed of region 4 and 5.
* Calculate and save the BAR size for the region.
*/
nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);

/*
* If the hardware has the fix for MIG, there is no requirement
* for splitting the device memory to create RESMEM. The entire
* device memory is usable and will be USEMEM. Return here for
* such case.
*/
if (!nvdev->has_mig_hw_bug)
goto done;

/*
* When the device memory is split to workaround the MIG bug on
* Grace Hopper, the USEMEM part of the device memory has to be
* MEMBLK_SIZE aligned. This is a hardwired ABI value between the
* GPU FW and VFIO driver. The VM device driver is also aware of it
* and makes use of the value for its calculation to determine USEMEM
* size. Note that the device memory may not be 512M aligned.
*/
nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
MEMBLK_SIZE);
Expand All @@ -803,15 +842,93 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
}

/*
* The memory regions are exposed as BARs. Calculate and save
* the BAR size for them.
* The resmem region is exposed as a 64b BAR composed of region 2 and 3
* for Grace Hopper. Calculate and save the BAR size for the region.
*/
nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
done:
return ret;
}

/*
 * Determine whether the device needs the uncached resmem workaround for
 * the MIG hardware bug.
 *
 * The device advertises the fix through bit MIG_SUPPORTED_WITH_CACHED_RESMEM
 * of the bitmap word located at DVSEC_BITMAP_OFFSET within the NVIDIA DVSEC
 * capability identified by GPU_CAP_DVSEC_REGISTER.
 *
 * Returns false only when the capability is present, the bitmap was read
 * successfully and the bit is set. In every other case (capability absent,
 * config read failure, bit clear) the bug is conservatively assumed to be
 * present and true is returned.
 */
static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
{
	int pcie_dvsec;
	u16 dvsec_ctrl16 = 0;

	pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
					       GPU_CAP_DVSEC_REGISTER);
	if (!pcie_dvsec)
		return true;

	/*
	 * Do not trust the bitmap if the config access failed: a failed
	 * read must not be mistaken for the "fixed hardware" indication.
	 */
	if (pci_read_config_word(pdev, pcie_dvsec + DVSEC_BITMAP_OFFSET,
				 &dvsec_ctrl16))
		return true;

	return !(dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM);
}

/*
 * To reduce the system bootup time, the HBM training has
 * been moved out of the UEFI on the Grace-Blackwell systems.
 *
 * The onus of checking whether the HBM training has completed
 * thus falls on the module. The HBM training status can be
 * determined from a BAR0 register.
 *
 * Similarly, another BAR0 register exposes the status of the
 * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
 *
 * Poll these registers every POLL_QUANTUM_MS for up to
 * POLL_TIMEOUT_MS (30s). If the HBM training is not complete or
 * if the C2C link is not ready by then, fail the probe.
 *
 * While the wait is not required on Grace Hopper systems, it
 * is beneficial to make the check to ensure the device is in an
 * expected state.
 *
 * Ensure that the BAR0 region is enabled before accessing the
 * registers.
 *
 * Returns 0 once both registers read STATUS_READY, -ETIME if they
 * do not within the timeout, -ENOMEM if BAR0 cannot be mapped, or
 * the error returned by pci_enable_device() or
 * pci_request_selected_regions().
 */
static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
{
	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
	void __iomem *io;
	int ret;

	ret = pci_enable_device(pdev);
	if (ret)
		return ret;

	ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
	if (ret)
		goto request_region_exit;

	io = pci_iomap(pdev, 0, 0);
	if (!io) {
		ret = -ENOMEM;
		goto iomap_exit;
	}

	/*
	 * ret is 0 at this point from the successful setup calls above.
	 * Assume a timeout until both status registers report ready, so
	 * that falling out of the loop is reported as a failure.
	 */
	ret = -ETIME;
	do {
		if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
		    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
			ret = 0;
			break;
		}
		msleep(POLL_QUANTUM_MS);
	} while (!time_after(jiffies, timeout));

	/* Undo the setup in reverse order; probe re-enables what it needs. */
	pci_iounmap(pdev, io);
iomap_exit:
	pci_release_selected_regions(pdev, 1 << 0);
request_region_exit:
	pci_disable_device(pdev);
	return ret;
}

static int nvgrace_gpu_probe(struct pci_dev *pdev,
const struct pci_device_id *id)
{
Expand All @@ -820,6 +937,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
u64 memphys, memlength;
int ret;

ret = nvgrace_gpu_wait_device_ready(pdev);
if (ret)
return ret;

ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
if (!ret)
ops = &nvgrace_gpu_pci_ops;
Expand All @@ -832,6 +953,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
dev_set_drvdata(&pdev->dev, &nvdev->core_device);

if (ops == &nvgrace_gpu_pci_ops) {
nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);

/*
* Device memory properties are identified in the host ACPI
* table. Set the nvgrace_gpu_pci_core_device structure.
Expand Down Expand Up @@ -868,6 +991,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
/* GH200 SKU */
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) },
/* GB200 SKU */
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) },
{}
};

Expand Down
8 changes: 4 additions & 4 deletions drivers/vfio/pci/vfio_pci_config.c
Original file line number Diff line number Diff line change
Expand Up @@ -511,13 +511,13 @@ static void vfio_bar_fixup(struct vfio_pci_core_device *vdev)
mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
mask |= PCI_ROM_ADDRESS_ENABLE;
*vbar &= cpu_to_le32((u32)mask);
} else if (pdev->resource[PCI_ROM_RESOURCE].flags &
IORESOURCE_ROM_SHADOW) {
mask = ~(0x20000 - 1);
} else if (pdev->rom && pdev->romlen) {
mask = ~(roundup_pow_of_two(pdev->romlen) - 1);
mask |= PCI_ROM_ADDRESS_ENABLE;
*vbar &= cpu_to_le32((u32)mask);
} else
} else {
*vbar = 0;
}

vdev->bardirty = false;
}
Expand Down
40 changes: 18 additions & 22 deletions drivers/vfio/pci/vfio_pci_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -1054,31 +1054,27 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,

info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
info.flags = 0;
info.size = 0;

/* Report the BAR size, not the ROM size */
info.size = pci_resource_len(pdev, info.index);
if (!info.size) {
/* Shadow ROMs appear as PCI option ROMs */
if (pdev->resource[PCI_ROM_RESOURCE].flags &
IORESOURCE_ROM_SHADOW)
info.size = 0x20000;
else
break;
}

/*
* Is it really there? Enable memory decode for implicit access
* in pci_map_rom().
*/
cmd = vfio_pci_memory_lock_and_enable(vdev);
io = pci_map_rom(pdev, &size);
if (io) {
if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
/*
* Check ROM content is valid. Need to enable memory
* decode for ROM access in pci_map_rom().
*/
cmd = vfio_pci_memory_lock_and_enable(vdev);
io = pci_map_rom(pdev, &size);
if (io) {
info.flags = VFIO_REGION_INFO_FLAG_READ;
/* Report the BAR size, not the ROM size. */
info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
pci_unmap_rom(pdev, io);
}
vfio_pci_memory_unlock_and_restore(vdev, cmd);
} else if (pdev->rom && pdev->romlen) {
info.flags = VFIO_REGION_INFO_FLAG_READ;
pci_unmap_rom(pdev, io);
} else {
info.size = 0;
/* Report BAR size as power of two. */
info.size = roundup_pow_of_two(pdev->romlen);
}
vfio_pci_memory_unlock_and_restore(vdev, cmd);

break;
}
Expand Down
Loading

0 comments on commit 3673f5b

Please sign in to comment.