From 92f86bff088dc6f0c0ed93b8e82d4d2459c35145 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 13 Mar 2018 11:17:29 -0600 Subject: vfio/common: cleanup in vfio_region_finalize Signed-off-by: Gerd Hoffmann Reviewed-by: Kirti Wankhede Signed-off-by: Alex Williamson --- hw/vfio/common.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'hw/vfio/common.c') diff --git a/hw/vfio/common.c b/hw/vfio/common.c index f895e3c335..6a8203a532 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -858,6 +858,13 @@ void vfio_region_finalize(VFIORegion *region) g_free(region->mmaps); trace_vfio_region_finalize(region->vbasedev->name, region->nr); + + region->mem = NULL; + region->mmaps = NULL; + region->nr_mmaps = 0; + region->size = 0; + region->flags = 0; + region->nr = 0; } void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled) -- cgit v1.2.3-55-g7522 From 567b5b309abe744b1098018a2eb157e7109c9f30 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 13 Mar 2018 11:17:30 -0600 Subject: vfio/pci: Relax DMA map errors for MMIO regions At the moment if vfio_memory_listener is registered in the system memory address space, it maps/unmaps every RAM memory region for DMA. It expects system page size aligned memory sections so vfio_dma_map would not fail and so far this has been the case. A mapping failure would be fatal. A side effect of such behavior is that some MMIO pages would silently not be mapped. However we are going to change MSIX BAR handling so we will end up having non-aligned sections in vfio_memory_listener (more details are in the next patch) and vfio_dma_map will exit QEMU. In order to avoid fatal failures on what previously was not a failure and was just silently ignored, this checks the section alignment to the smallest supported IOMMU page size and prints an error if not aligned; it also prints an error if vfio_dma_map failed despite the page size check. 
Neither error is fatal; only MMIO RAM regions are checked (aka "RAM device" regions). If the number of errors printed is overwhelming, the MSIX relocation could be used to avoid excessive error output. This is unlikely to cause any behavioral change. Signed-off-by: Alexey Kardashevskiy [aw: Fix Int128 bit ops] Signed-off-by: Alex Williamson --- hw/vfio/common.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 6 deletions(-) (limited to 'hw/vfio/common.c') diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 6a8203a532..07c03d78b6 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -544,18 +544,40 @@ static void vfio_listener_region_add(MemoryListener *listener, llsize = int128_sub(llend, int128_make64(iova)); + if (memory_region_is_ram_device(section->mr)) { + hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; + + if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { + error_report("Region 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx + " is not aligned to 0x%"HWADDR_PRIx + " and cannot be mapped for DMA", + section->offset_within_region, + int128_getlo(section->size), + pgmask + 1); + return; + } + } + ret = vfio_dma_map(container, iova, int128_get64(llsize), vaddr, section->readonly); if (ret) { error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%m)", container, iova, int128_get64(llsize), vaddr, ret); + if (memory_region_is_ram_device(section->mr)) { + /* Allow unexpected mappings not to be fatal for RAM devices */ + return; + } goto fail; } return; fail: + if (memory_region_is_ram_device(section->mr)) { + error_report("failed to vfio_dma_map. pci p2p may not work"); + return; + } /* * On the initfn path, store the first error in the container so we * can gracefully fail. 
Runtime, there's not much we can do other @@ -577,6 +599,7 @@ static void vfio_listener_region_del(MemoryListener *listener, hwaddr iova, end; Int128 llend, llsize; int ret; + bool try_unmap = true; if (vfio_listener_skipped_section(section)) { trace_vfio_listener_region_del_skip( @@ -629,14 +652,34 @@ static void vfio_listener_region_del(MemoryListener *listener, trace_vfio_listener_region_del(iova, end); - ret = vfio_dma_unmap(container, iova, int128_get64(llsize)); - memory_region_unref(section->mr); - if (ret) { - error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%m)", - container, iova, int128_get64(llsize), ret); + if (memory_region_is_ram_device(section->mr)) { + hwaddr pgmask; + VFIOHostDMAWindow *hostwin; + bool hostwin_found = false; + + QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { + hostwin_found = true; + break; + } + } + assert(hostwin_found); /* or region_add() would have failed */ + + pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; + try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); + } + + if (try_unmap) { + ret = vfio_dma_unmap(container, iova, int128_get64(llsize)); + if (ret) { + error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%m)", + container, iova, int128_get64(llsize), ret); + } } + memory_region_unref(section->mr); + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { vfio_spapr_remove_window(container, section->offset_within_address_space); -- cgit v1.2.3-55-g7522 From ae0215b2bb56a9d5321a185dde133bfdd306a4c0 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 13 Mar 2018 11:17:31 -0600 Subject: vfio-pci: Allow mmap of MSIX BAR At the moment we unconditionally avoid mapping MSIX data of a BAR and emulate MSIX table in QEMU. 
However, this 1) is not always necessary, as a platform may provide a paravirt interface for MSIX configuration; and 2) can slow down MMIO access, since frequently accessed registers that share the same system page with MSIX data end up being emulated in QEMU; this is particularly a problem for systems with a page size bigger than 4KB. A new capability - VFIO_REGION_INFO_CAP_MSIX_MAPPABLE - has been added to the kernel [1] which tells userspace that mapping of the MSIX data is now possible. This patch makes use of it, so from now on QEMU tries mapping the entire BAR as a whole and emulates MSIX on top of that. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6 Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Alex Williamson --- hw/vfio/common.c | 15 +++++++++++++++ hw/vfio/pci.c | 9 +++++++++ include/hw/vfio/vfio-common.h | 1 + 3 files changed, 25 insertions(+) (limited to 'hw/vfio/common.c') diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 07c03d78b6..5e84716218 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1471,6 +1471,21 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, return -ENODEV; } +bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) +{ + struct vfio_region_info *info = NULL; + bool ret = false; + + if (!vfio_get_region_info(vbasedev, region, &info)) { + if (vfio_get_region_info_cap(info, cap_type)) { + ret = true; + } + g_free(info); + } + + return ret; +} + /* * Interfaces for IBM EEH (Enhanced Error Handling) */ diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index b9d2c12b82..02974f4eb9 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1294,6 +1294,15 @@ static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev) off_t start, end; VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region; + /* + * If the host driver allows mapping of a MSIX data, we are going to + * do map the entire BAR and emulate MSIX 
table on top of that. + */ + if (vfio_has_region_cap(&vdev->vbasedev, region->nr, + VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) { + return; + } + /* * We expect to find a single mmap covering the whole BAR, anything else * means it's either unsupported or already setup. diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index c5efa32750..d9360148e6 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -193,6 +193,7 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info); int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, uint32_t subtype, struct vfio_region_info **info); +bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type); #endif extern const MemoryListener vfio_prereg_listener; -- cgit v1.2.3-55-g7522