VFIO updates for v6.14-rc1

  - Extend vfio-pci 8-byte read/write support to include archs defining
    CONFIG_GENERIC_IOMAP, such as x86, and remove now extraneous #ifdefs
    around 64-bit accessors. (Ramesh Thomas)
 
  - Update vfio-pci shadow ROM handling and allow cached ROM from setup
    data to be exposed as a functional ROM BAR region when available.
    (Yunxiang Li)
 
  - Update nvgrace-gpu vfio-pci variant driver for new Grace Blackwell
    hardware, conditionalizing the uncached BAR workaround for previous
    generation hardware based on the presence of a flag in a new DVSEC
    capability, and include a delay during probe for link training to
    complete, a new requirement for GB devices. (Ankit Agrawal)
 -----BEGIN PGP SIGNATURE-----
 
 iQJPBAABCAA5FiEEQvbATlQL0amee4qQI5ubbjuwiyIFAmeZNf0bHGFsZXgud2ls
 bGlhbXNvbkByZWRoYXQuY29tAAoJECObm247sIsiZDIP/16zZ6FwPxu52erigZCN
 6GwniPU051kDEKMhlO9aGSJNXkzB3qGYUXH1rExIYazRrovc9QYRG1JXBEGIj+Dj
 NkxhwJBvaj61dc1p+lfNH/jZYipE+mbuguz1gOZfwMEA/8uNmsFd2uhXBZBehI2X
 WCQQdxwz9GWB34pabN2OuR3cFwYbYD06kg/WfxsAEisDhRsjrPn//fbwZNBM7mLy
 4gmatoQ97uPexo+SACQSIIop7TlNeiA+Mo8i/XxTpmjry9Wl0tMNBKfNaxMf7o5p
 4laRU6EyT0/Cimc1w8Mct96fvO1AqKIRnBqFYwxzmtYthllpKPqnoZlwOPSE24f7
 zbB46NkhdE6JOsqJUMPj+hdW3bBhQgcpIMU3MkYgbzNVjcb5DcIDZk3b64DIJOqK
 HzItxvUNXVo9HYnc1gdI88c2btDA1hDOzH5fFX85AmQcUqs24+i7ekdEyws65J0O
 iVBJP/cC51vAJv0y4gtty+bq1OqcQ3jwnEvre52F9LPJVHFsKA8RheOyodlG0Ar8
 m1zWJVZbQIFbs8gp+q/GHdltQ9w0XvECQOe1EE7zxAQX0noks+3S+Ea+wAYYZH5p
 a1fbep0MoL3fZF+s4a7kc/avcm1WRpQTSY10HC3K/+0wQ5S0B8n/QG4S9mMsEEBn
 G/9+7ELvYrFop/CfC4Mkj0MA
 =NJZ8
 -----END PGP SIGNATURE-----

Merge tag 'vfio-v6.14-rc1' of https://github.com/awilliam/linux-vfio

Pull vfio updates from Alex Williamson:

 - Extend vfio-pci 8-byte read/write support to include archs defining
   CONFIG_GENERIC_IOMAP, such as x86, and remove now extraneous #ifdefs
   around 64-bit accessors (Ramesh Thomas); a sketch of the lo-hi
   fallback involved follows the commit list below

 - Update vfio-pci shadow ROM handling and allow cached ROM from setup
   data to be exposed as a functional ROM BAR region when available
   (Yunxiang Li)

 - Update nvgrace-gpu vfio-pci variant driver for new Grace Blackwell
   hardware, conditionalizing the uncached BAR workaround for previous
   generation hardware based on the presence of a flag in a new DVSEC
   capability, and include a delay during probe for link training to
   complete, a new requirement for GB devices (Ankit Agrawal)

* tag 'vfio-v6.14-rc1' of https://github.com/awilliam/linux-vfio:
  vfio/nvgrace-gpu: Add GB200 SKU to the devid table
  vfio/nvgrace-gpu: Check the HBM training and C2C link status
  vfio/nvgrace-gpu: Expose the blackwell device PF BAR1 to the VM
  vfio/nvgrace-gpu: Read dvsec register to determine need for uncached resmem
  vfio/platform: check the bounds of read/write syscalls
  vfio/pci: Expose setup ROM at ROM bar when needed
  vfio/pci: Remove shadow ROM specific code paths
  vfio/pci: Remove #ifdef iowrite64 and #ifdef ioread64
  vfio/pci: Enable iowrite64 and ioread64 for vfio pci
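
The first bullet's #ifdef removal works because <linux/io-64-nonatomic-lo-hi.h> supplies ioread64/iowrite64 on architectures without native 64-bit MMIO accessors, splitting each access into two 32-bit operations, low word first. A minimal sketch of that fallback, with illustrative names rather than the kernel's own:

#include <linux/io.h>

/* Hypothetical stand-ins for the lo-hi helpers the header provides. */
static inline u64 sketch_ioread64_lo_hi(void __iomem *addr)
{
	u32 low = ioread32(addr);	/* low 32 bits first */
	u32 high = ioread32(addr + 4);	/* then the high 32 bits */

	return low + ((u64)high << 32);
}

static inline void sketch_iowrite64_lo_hi(u64 val, void __iomem *addr)
{
	iowrite32(val, addr);		/* low 32 bits first */
	iowrite32(val >> 32, addr + 4);	/* then the high 32 bits */
}

Such a split is not atomic, which is why the ordering (lo-hi vs hi-lo) is a per-driver choice: the device defines which ordering is safe to observe.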
Commit 3673f5be0e by Linus Torvalds, 2025-01-28 14:16:46 -08:00
5 changed files with 196 additions and 69 deletions

drivers/vfio/pci/nvgrace-gpu/main.c

@@ -5,6 +5,8 @@
#include <linux/sizes.h>
#include <linux/vfio_pci_core.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
/*
* The device memory usable to the workloads running in the VM is cached
@@ -17,12 +19,21 @@
#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX
/* Memory size expected as non cached and reserved by the VM driver */
#define RESMEM_SIZE SZ_1G
/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
#define MEMBLK_SIZE SZ_512M
#define DVSEC_BITMAP_OFFSET 0xA
#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)
#define GPU_CAP_DVSEC_REGISTER 3
#define C2C_LINK_BAR0_OFFSET 0x1498
#define HBM_TRAINING_BAR0_OFFSET 0x200BC
#define STATUS_READY 0xFF
#define POLL_QUANTUM_MS 1000
#define POLL_TIMEOUT_MS (30 * 1000)
/*
* The state of the two device memory region - resmem and usemem - is
* saved as struct mem_region.
@@ -46,6 +57,7 @@ struct nvgrace_gpu_pci_core_device {
struct mem_region resmem;
/* Lock to control device memory kernel mapping */
struct mutex remap_lock;
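/* Set when MIG needs the uncached resmem carve-out (pre-Blackwell HW) */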
bool has_mig_hw_bug;
};
static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
@@ -66,7 +78,7 @@ nvgrace_gpu_memregion(int index,
if (index == USEMEM_REGION_INDEX)
return &nvdev->usemem;
if (index == RESMEM_REGION_INDEX)
if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
return &nvdev->resmem;
return NULL;
@@ -751,40 +763,67 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
u64 memphys, u64 memlength)
{
int ret = 0;
u64 resmem_size = 0;
/*
* The VM GPU device driver needs a non-cacheable region to support
* the MIG feature. Since the device memory is mapped as NORMAL cached,
* carve out a region from the end with a different NORMAL_NC
* property (called as reserved memory and represented as resmem). This
* region then is exposed as a 64b BAR (region 2 and 3) to the VM, while
* exposing the rest (termed as usable memory and represented using usemem)
* as cacheable 64b BAR (region 4 and 5).
* On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
* region to support the MIG feature owing to a hardware bug. Since the
* device memory is mapped as NORMAL cached, carve out a region from the end
* with a different NORMAL_NC property (called as reserved memory and
* represented as resmem). This region then is exposed as a 64b BAR
* (region 2 and 3) to the VM, while exposing the rest (termed as usable
* memory and represented using usemem) as cacheable 64b BAR (region 4 and 5).
*
* devmem (memlength)
* |-------------------------------------------------|
* | |
* usemem.memphys resmem.memphys
*
* This hardware bug is fixed on the Grace Blackwell platforms and the
* presence of the bug can be determined through nvdev->has_mig_hw_bug.
* Thus on systems with the hardware fix, there is no need to partition
* the GPU device memory and the entire memory is usable and mapped as
* NORMAL cached (i.e. resmem size is 0).
*/
if (nvdev->has_mig_hw_bug)
resmem_size = SZ_1G;
nvdev->usemem.memphys = memphys;
/*
* The device memory exposed to the VM is added to the kernel by the
* VM driver module in chunks of memory block size. Only the usable
* memory (usemem) is added to the kernel for usage by the VM
* workloads. Make the usable memory size memblock aligned.
* VM driver module in chunks of memory block size. Note that only the
* usable memory (usemem) is added to the kernel for usage by the VM
* workloads.
*/
if (check_sub_overflow(memlength, RESMEM_SIZE,
if (check_sub_overflow(memlength, resmem_size,
&nvdev->usemem.memlength)) {
ret = -EOVERFLOW;
goto done;
}
/*
* The USEMEM part of the device memory has to be MEMBLK_SIZE
* aligned. This is a hardwired ABI value between the GPU FW and
* VFIO driver. The VM device driver is also aware of it and make
* use of the value for its calculation to determine USEMEM size.
* The usemem region is exposed as a 64B Bar composed of region 4 and 5.
* Calculate and save the BAR size for the region.
*/
nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
/*
* If the hardware has the fix for MIG, there is no requirement
* for splitting the device memory to create RESMEM. The entire
* device memory is usable and will be USEMEM. Return here for
* such case.
*/
if (!nvdev->has_mig_hw_bug)
goto done;
/*
* When the device memory is split to workaround the MIG bug on
* Grace Hopper, the USEMEM part of the device memory has to be
* MEMBLK_SIZE aligned. This is a hardwired ABI value between the
* GPU FW and VFIO driver. The VM device driver is also aware of it
* and make use of the value for its calculation to determine USEMEM
* size. Note that the device memory may not be 512M aligned.
*/
nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
MEMBLK_SIZE);
@@ -803,15 +842,93 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
}
/*
* The memory regions are exposed as BARs. Calculate and save
* the BAR size for them.
* The resmem region is exposed as a 64b BAR composed of region 2 and 3
* for Grace Hopper. Calculate and save the BAR size for the region.
*/
nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
done:
return ret;
}
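/*
 * Grace Hopper carries a hardware bug that forces MIG to use the
 * uncached resmem carve-out. Later devices advertise the fix via a
 * flag in the NVIDIA GPU_CAP DVSEC capability; without that flag,
 * assume the bug is present.
 */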
static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
{
int pcie_dvsec;
u16 dvsec_ctrl16;
pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
GPU_CAP_DVSEC_REGISTER);
if (pcie_dvsec) {
pci_read_config_word(pdev,
pcie_dvsec + DVSEC_BITMAP_OFFSET,
&dvsec_ctrl16);
if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM)
return false;
}
return true;
}
/*
* To reduce the system bootup time, the HBM training has
* been moved out of the UEFI on the Grace-Blackwell systems.
*
* The onus of checking whether the HBM training has completed
* thus falls on the module. The HBM training status can be
* determined from a BAR0 register.
*
* Similarly, another BAR0 register exposes the status of the
* CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
*
* Poll these register and check for 30s. If the HBM training is
* not complete or if the C2C link is not ready, fail the probe.
*
* While the wait is not required on Grace Hopper systems, it
* is beneficial to make the check to ensure the device is in an
* expected state.
*
* Ensure that the BAR0 region is enabled before accessing the
* registers.
*/
static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
{
unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
void __iomem *io;
int ret = -ETIME;
ret = pci_enable_device(pdev);
if (ret)
return ret;
ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
if (ret)
goto request_region_exit;
io = pci_iomap(pdev, 0, 0);
if (!io) {
ret = -ENOMEM;
goto iomap_exit;
}
do {
if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
(ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
ret = 0;
goto reg_check_exit;
}
msleep(POLL_QUANTUM_MS);
} while (!time_after(jiffies, timeout));
ret = -ETIME;
reg_check_exit:
pci_iounmap(pdev, io);
iomap_exit:
pci_release_selected_regions(pdev, 1 << 0);
request_region_exit:
pci_disable_device(pdev);
return ret;
}
static int nvgrace_gpu_probe(struct pci_dev *pdev,
const struct pci_device_id *id)
{
@@ -820,6 +937,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
u64 memphys, memlength;
int ret;
ret = nvgrace_gpu_wait_device_ready(pdev);
if (ret)
return ret;
ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
if (!ret)
ops = &nvgrace_gpu_pci_ops;
@@ -832,6 +953,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
dev_set_drvdata(&pdev->dev, &nvdev->core_device);
if (ops == &nvgrace_gpu_pci_ops) {
nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);
/*
* Device memory properties are identified in the host ACPI
* table. Set the nvgrace_gpu_pci_core_device structure.
@@ -868,6 +991,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
/* GH200 SKU */
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) },
/* GB200 SKU */
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) },
{}
};

drivers/vfio/pci/vfio_pci_config.c

@@ -511,13 +511,13 @@ static void vfio_bar_fixup(struct vfio_pci_core_device *vdev)
mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
mask |= PCI_ROM_ADDRESS_ENABLE;
*vbar &= cpu_to_le32((u32)mask);
} else if (pdev->resource[PCI_ROM_RESOURCE].flags &
IORESOURCE_ROM_SHADOW) {
mask = ~(0x20000 - 1);
} else if (pdev->rom && pdev->romlen) {
mask = ~(roundup_pow_of_two(pdev->romlen) - 1);
mask |= PCI_ROM_ADDRESS_ENABLE;
*vbar &= cpu_to_le32((u32)mask);
} else
} else {
*vbar = 0;
}
vdev->bardirty = false;
}
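
The mask arithmetic above follows standard PCI BAR sizing: software writes all ones to the BAR, reads it back, and the low bits stay clear so the size, always a power of two, can be recovered. A hedged sketch of that round trip against the emulated ROM BAR (illustrative only, not driver code):

#include <linux/log2.h>
#include <linux/pci.h>

static u32 sketch_rom_bar_sizing(u32 romlen)
{
	/* What vfio's emulation returns after the guest writes ~0U. */
	u32 mask = ~(roundup_pow_of_two(romlen) - 1) | PCI_ROM_ADDRESS_ENABLE;
	u32 readback = 0xFFFFFFFFu & mask;
	/* Guest side: strip the enable bit, invert, add one. */
	u32 bits = readback & (u32)PCI_ROM_ADDRESS_MASK;

	return ~bits + 1;	/* e.g. romlen = 0x8000 yields 0x8000 */
}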

drivers/vfio/pci/vfio_pci_core.c

@@ -1054,31 +1054,27 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
info.flags = 0;
info.size = 0;
/* Report the BAR size, not the ROM size */
info.size = pci_resource_len(pdev, info.index);
if (!info.size) {
/* Shadow ROMs appear as PCI option ROMs */
if (pdev->resource[PCI_ROM_RESOURCE].flags &
IORESOURCE_ROM_SHADOW)
info.size = 0x20000;
else
break;
}
/*
* Is it really there? Enable memory decode for implicit access
* in pci_map_rom().
*/
cmd = vfio_pci_memory_lock_and_enable(vdev);
io = pci_map_rom(pdev, &size);
if (io) {
if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
/*
* Check ROM content is valid. Need to enable memory
* decode for ROM access in pci_map_rom().
*/
cmd = vfio_pci_memory_lock_and_enable(vdev);
io = pci_map_rom(pdev, &size);
if (io) {
info.flags = VFIO_REGION_INFO_FLAG_READ;
/* Report the BAR size, not the ROM size. */
info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
pci_unmap_rom(pdev, io);
}
vfio_pci_memory_unlock_and_restore(vdev, cmd);
} else if (pdev->rom && pdev->romlen) {
info.flags = VFIO_REGION_INFO_FLAG_READ;
pci_unmap_rom(pdev, io);
} else {
info.size = 0;
/* Report BAR size as power of two. */
info.size = roundup_pow_of_two(pdev->romlen);
}
vfio_pci_memory_unlock_and_restore(vdev, cmd);
break;
}
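
From userspace, the change means a device exposing only a cached setup ROM now reports a readable ROM region sized to the power-of-two BAR. A hedged sketch of the query (the device fd is assumed to come from the usual VFIO group or cdev open path):

#include <linux/vfio.h>
#include <stdio.h>
#include <sys/ioctl.h>

static void sketch_query_rom(int device_fd)
{
	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = VFIO_PCI_ROM_REGION_INDEX,
	};

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info)) {
		perror("VFIO_DEVICE_GET_REGION_INFO");
		return;
	}
	if (info.flags & VFIO_REGION_INFO_FLAG_READ)
		printf("ROM: %llu bytes at offset 0x%llx\n",
		       (unsigned long long)info.size,
		       (unsigned long long)info.offset);
	else
		printf("no readable ROM exposed\n");
}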

drivers/vfio/pci/vfio_pci_rdwr.c

@@ -16,6 +16,7 @@
#include <linux/io.h>
#include <linux/vfio.h>
#include <linux/vgaarb.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include "vfio_pci_priv.h"
@@ -61,9 +62,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_iowrite##size);
VFIO_IOWRITE(8)
VFIO_IOWRITE(16)
VFIO_IOWRITE(32)
#ifdef iowrite64
VFIO_IOWRITE(64)
#endif
#define VFIO_IOREAD(size) \
int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \
@@ -89,9 +88,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_ioread##size);
VFIO_IOREAD(8)
VFIO_IOREAD(16)
VFIO_IOREAD(32)
#ifdef ioread64
VFIO_IOREAD(64)
#endif
#define VFIO_IORDWR(size) \
static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\
@@ -127,9 +124,7 @@ static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\
VFIO_IORDWR(8)
VFIO_IORDWR(16)
VFIO_IORDWR(32)
#if defined(ioread64) && defined(iowrite64)
VFIO_IORDWR(64)
#endif
/*
* Read or write from an __iomem region (MMIO or I/O port) with an excluded
@@ -155,7 +150,6 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
else
fillable = 0;
#if defined(ioread64) && defined(iowrite64)
if (fillable >= 8 && !(off % 8)) {
ret = vfio_pci_iordwr64(vdev, iswrite, test_mem,
io, buf, off, &filled);
@@ -163,7 +157,6 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
return ret;
} else
#endif
if (fillable >= 4 && !(off % 4)) {
ret = vfio_pci_iordwr32(vdev, iswrite, test_mem,
io, buf, off, &filled);
@@ -244,9 +237,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
if (pci_resource_start(pdev, bar))
end = pci_resource_len(pdev, bar);
else if (bar == PCI_ROM_RESOURCE &&
pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW)
end = 0x20000;
else if (bar == PCI_ROM_RESOURCE && pdev->rom && pdev->romlen)
end = roundup_pow_of_two(pdev->romlen);
else
return -EINVAL;
@@ -261,11 +253,14 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
* excluded range at the end of the actual ROM. This makes
* filling large ROM BARs much faster.
*/
io = pci_map_rom(pdev, &x_start);
if (!io) {
done = -ENOMEM;
goto out;
if (pci_resource_start(pdev, bar)) {
io = pci_map_rom(pdev, &x_start);
} else {
io = ioremap(pdev->rom, pdev->romlen);
x_start = pdev->romlen;
}
if (!io)
return -ENOMEM;
x_end = end;
} else {
int ret = vfio_pci_core_setup_barmap(vdev, bar);
@@ -288,8 +283,13 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
if (done >= 0)
*ppos += done;
if (bar == PCI_ROM_RESOURCE)
pci_unmap_rom(pdev, io);
if (bar == PCI_ROM_RESOURCE) {
if (pci_resource_start(pdev, bar))
pci_unmap_rom(pdev, io);
else
iounmap(io);
}
out:
return done;
}
@@ -381,12 +381,10 @@ static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd,
vfio_pci_core_iowrite32(ioeventfd->vdev, test_mem,
ioeventfd->data, ioeventfd->addr);
break;
#ifdef iowrite64
case 8:
vfio_pci_core_iowrite64(ioeventfd->vdev, test_mem,
ioeventfd->data, ioeventfd->addr);
break;
#endif
}
}
@@ -440,10 +438,8 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
pos >= vdev->msix_offset + vdev->msix_size))
return -EINVAL;
#ifndef iowrite64
if (count == 8)
return -EINVAL;
#endif
ret = vfio_pci_core_setup_barmap(vdev, bar);
if (ret)

drivers/vfio/platform/vfio_platform_common.c

@@ -388,6 +388,11 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg,
{
unsigned int done = 0;
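/* Reject offsets past the region and clamp count to what remains. */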
if (off >= reg->size)
return -EINVAL;
count = min_t(size_t, count, reg->size - off);
if (!reg->ioaddr) {
reg->ioaddr =
ioremap(reg->addr, reg->size);
@@ -467,6 +472,11 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg,
{
unsigned int done = 0;
if (off >= reg->size)
return -EINVAL;
count = min_t(size_t, count, reg->size - off);
if (!reg->ioaddr) {
reg->ioaddr =
ioremap(reg->addr, reg->size);