From 21e00fa55f3fdfcbb20da7c6876c91ef3609b387 Mon Sep 17 00:00:00 2001
From: Alex Williamson
Date: Mon, 31 Oct 2016 09:53:03 -0600
Subject: memory: Replace skip_dump flag with "ram_device"

Setting skip_dump on a MemoryRegion allows us to modify one specific
code path, but the restriction we're trying to address encompasses
more than that.  If we have a RAM MemoryRegion backed by a physical
device, it not only restricts our ability to dump that region, but
also affects how we should manipulate it.  Here we recognize that
MemoryRegions do not change to sometimes allow dumps and other times
not, so we replace setting the skip_dump flag with a new initializer
so that we know exactly the type of region to which we're applying
this behavior.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/vfio/common.c | 9 ++++-----
 hw/vfio/spapr.c  | 2 +-
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'hw')

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 9505fb3040..c764cb3d22 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -724,12 +724,11 @@ int vfio_region_mmap(VFIORegion *region)
 
         name = g_strdup_printf("%s mmaps[%d]",
                                memory_region_name(region->mem), i);
-        memory_region_init_ram_ptr(&region->mmaps[i].mem,
-                                   memory_region_owner(region->mem),
-                                   name, region->mmaps[i].size,
-                                   region->mmaps[i].mmap);
+        memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
+                                          memory_region_owner(region->mem),
+                                          name, region->mmaps[i].size,
+                                          region->mmaps[i].mmap);
         g_free(name);
-        memory_region_set_skip_dump(&region->mmaps[i].mem);
         memory_region_add_subregion(region->mem, region->mmaps[i].offset,
                                     &region->mmaps[i].mem);
 
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 7443d348d9..4409bcc0d7 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -25,7 +25,7 @@ static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
     }
 
     return !memory_region_is_ram(section->mr) ||
-            memory_region_is_skip_dump(section->mr);
+            memory_region_is_ram_device(section->mr);
 }
 
 static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
-- 
cgit v1.2.3-55-g7522


From 24acf72b9a291cebfd05f2ecdf3a982ac01e6291 Mon Sep 17 00:00:00 2001
From: Alex Williamson
Date: Mon, 31 Oct 2016 09:53:03 -0600
Subject: vfio: Handle zero-length sparse mmap ranges

As reported in the link below, a user has a PCI device with a 4KB BAR
which contains the MSI-X table.  This seems to hit a corner case in
the kernel where the region reports being mmap capable, but the sparse
mmap information reports a zero-sized range.  It's not entirely clear
that the kernel is incorrect in doing this, but regardless, we need
to handle it.  To do this, fill our mmap array only with non-zero
sized sparse mmap entries and add an error return from the function
so we can tell the difference between nr_mmaps being zero based on
sparse mmap info vs lack of sparse mmap info.

NB, this doesn't actually change the behavior of the device, it only
removes the scary "Failed to mmap ... Performance may be slow" error
message.  We cannot currently create an mmap over the MSI-X table.

Link: http://lists.nongnu.org/archive/html/qemu-discuss/2016-10/msg00009.html
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/vfio/common.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

(limited to 'hw')

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index c764cb3d22..f528309b81 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -610,16 +610,16 @@ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
     return NULL;
 }
 
-static void vfio_setup_region_sparse_mmaps(VFIORegion *region,
-                                           struct vfio_region_info *info)
+static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
+                                          struct vfio_region_info *info)
 {
     struct vfio_info_cap_header *hdr;
     struct vfio_region_info_cap_sparse_mmap *sparse;
-    int i;
+    int i, j;
 
     hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
     if (!hdr) {
-        return;
+        return -ENODEV;
     }
 
     sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
@@ -627,16 +627,24 @@ static void vfio_setup_region_sparse_mmaps(VFIORegion *region,
     trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
                                          region->nr, sparse->nr_areas);
 
-    region->nr_mmaps = sparse->nr_areas;
-    region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
+    region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
 
-    for (i = 0; i < region->nr_mmaps; i++) {
-        region->mmaps[i].offset = sparse->areas[i].offset;
-        region->mmaps[i].size = sparse->areas[i].size;
-        trace_vfio_region_sparse_mmap_entry(i, region->mmaps[i].offset,
-                                            region->mmaps[i].offset +
-                                            region->mmaps[i].size);
+    for (i = 0, j = 0; i < sparse->nr_areas; i++) {
+        trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
+                                            sparse->areas[i].offset +
+                                            sparse->areas[i].size);
+
+        if (sparse->areas[i].size) {
+            region->mmaps[j].offset = sparse->areas[i].offset;
+            region->mmaps[j].size = sparse->areas[i].size;
+            j++;
+        }
     }
+
+    region->nr_mmaps = j;
+    region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
+
+    return 0;
 }
 
 int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
@@ -665,9 +673,9 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
             region->flags & VFIO_REGION_INFO_FLAG_MMAP &&
             !(region->size & ~qemu_real_host_page_mask)) {
 
-            vfio_setup_region_sparse_mmaps(region, info);
+            ret = vfio_setup_region_sparse_mmaps(region, info);
 
-            if (!region->nr_mmaps) {
+            if (ret) {
                 region->nr_mmaps = 1;
                 region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
                 region->mmaps[0].offset = 0;
-- 
cgit v1.2.3-55-g7522


From a52a4c471703e995ceb06f6157d70747823e8a0d Mon Sep 17 00:00:00 2001
From: Ido Yariv
Date: Mon, 31 Oct 2016 09:53:04 -0600
Subject: vfio/pci: fix out-of-sync BAR information on reset

When a PCI device is reset, pci_do_device_reset resets all BAR addresses
in the relevant PCIDevice's config buffer.

The VFIO configuration space stays untouched, so the guest OS may choose
to skip restoring the BAR addresses as they would seem intact. The PCI
device may be left non-operational.
One example of such a scenario is when the guest exits S3.

Fix this by resetting the BAR addresses in the VFIO configuration space
as well.

Signed-off-by: Ido Yariv <ido@wizery.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/vfio/pci.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'hw')

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 65d30fdef9..b399742058 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1922,11 +1922,23 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
 static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
 {
     Error *err = NULL;
+    int nr;
 
     vfio_intx_enable(vdev, &err);
     if (err) {
         error_reportf_err(err, ERR_PREFIX, vdev->vbasedev.name);
     }
+
+    for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
+        off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr);
+        uint32_t val = 0;
+        uint32_t len = sizeof(val);
+
+        if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) {
+            error_report("%s(%s) reset bar %d failed: %m", __func__,
+                         vdev->vbasedev.name, nr);
+        }
+    }
 }
 
 static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
-- 
cgit v1.2.3-55-g7522


From 95251725e335af2b885e2ab33dd29c86f8084663 Mon Sep 17 00:00:00 2001
From: Yongji Xie
Date: Mon, 31 Oct 2016 09:53:04 -0600
Subject: vfio: Add support for mmapping sub-page MMIO BARs

Kernel commit 05f0c03fbac1 ("vfio-pci: Allow to mmap
sub-page MMIO BARs if the mmio page is exclusive") allows VFIO
to mmap sub-page BARs. This is the corresponding QEMU patch.
With both patches applied, we can pass sub-page BARs through
to the guest, which can improve I/O performance for some devices.

In this patch, we expand the MemoryRegions of these sub-page
MMIO BARs to PAGE_SIZE in vfio_pci_write_config(), so that
the BARs can be passed to the KVM_SET_USER_MEMORY_REGION ioctl
with a valid size. The original size is restored when the guest
changes the base address of a sub-page BAR so that it is no longer
page aligned. We also set the priority of these BARs' memory
regions to zero in case they overlap with other BARs that share
the same guest page as the sub-page BAR.

Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 hw/vfio/common.c |  3 +--
 hw/vfio/pci.c    | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 2 deletions(-)

(limited to 'hw')

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index f528309b81..801578b4b9 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -670,8 +670,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
                               region, name, region->size);
 
         if (!vbasedev->no_mmap &&
-            region->flags & VFIO_REGION_INFO_FLAG_MMAP &&
-            !(region->size & ~qemu_real_host_page_mask)) {
+            region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
 
             ret = vfio_setup_region_sparse_mmaps(region, info);
 
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index b399742058..d7dbe0e3e0 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1070,6 +1070,55 @@ static const MemoryRegionOps vfio_vga_ops = {
     .endianness = DEVICE_LITTLE_ENDIAN,
 };
 
+/*
+ * Expand the memory region of a sub-page (size < PAGE_SIZE) MMIO BAR to
+ * page size if the BAR occupies an exclusive page on the host, so that
+ * we can map this BAR into the guest. The sub-page BAR may not occupy an
+ * exclusive page in the guest, however, so we set the priority of the
+ * expanded memory region to zero in case it overlaps with other BARs
+ * that share the same guest page. We also restore the original size of
+ * the sub-page BAR when the guest changes its base address so that it
+ * is no longer page aligned.
+ */
+static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
+{
+    VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
+    VFIORegion *region = &vdev->bars[bar].region;
+    MemoryRegion *mmap_mr, *mr;
+    PCIIORegion *r;
+    pcibus_t bar_addr;
+    uint64_t size = region->size;
+
+    /* Make sure that the whole region is allowed to be mmapped */
+    if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
+        region->mmaps[0].size != region->size) {
+        return;
+    }
+
+    r = &pdev->io_regions[bar];
+    bar_addr = r->addr;
+    mr = region->mem;
+    mmap_mr = &region->mmaps[0].mem;
+
+    /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
+    if (bar_addr != PCI_BAR_UNMAPPED &&
+        !(bar_addr & ~qemu_real_host_page_mask)) {
+        size = qemu_real_host_page_size;
+    }
+
+    memory_region_transaction_begin();
+
+    memory_region_set_size(mr, size);
+    memory_region_set_size(mmap_mr, size);
+    if (size != region->size && memory_region_is_mapped(mr)) {
+        memory_region_del_subregion(r->address_space, mr);
+        memory_region_add_subregion_overlap(r->address_space,
+                                            bar_addr, mr, 0);
+    }
+
+    memory_region_transaction_commit();
+}
+
 /*
  * PCI config space
  */
@@ -1153,6 +1202,24 @@ void vfio_pci_write_config(PCIDevice *pdev,
         } else if (was_enabled && !is_enabled) {
             vfio_msix_disable(vdev);
         }
+    } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
+        range_covers_byte(addr, len, PCI_COMMAND)) {
+        pcibus_t old_addr[PCI_NUM_REGIONS - 1];
+        int bar;
+
+        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
+            old_addr[bar] = pdev->io_regions[bar].addr;
+        }
+
+        pci_default_write_config(pdev, addr, val, len);
+
+        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
+            if (old_addr[bar] != pdev->io_regions[bar].addr &&
+                pdev->io_regions[bar].size > 0 &&
+                pdev->io_regions[bar].size < qemu_real_host_page_size) {
+                vfio_sub_page_bar_update_mapping(pdev, bar);
+            }
+        }
     } else {
         /* Write everything to QEMU to keep emulated bits correct */
         pci_default_write_config(pdev, addr, val, len);
-- 
cgit v1.2.3-55-g7522