From 50a6fa8f939a3448967592e5136afcc063f9a25d Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 24 Apr 2018 08:38:57 +0200 Subject: allocate pci id for mdpy mdpy is a sample pci device for vfio-mdev. Not (yet) merged upstream, patch available here: https://www.kraxel.org/cgit/linux/commit/?h=vfio-sample-display&id=6fd86cff3d7df38ab89625b16fdd6434b1c18749 Cc: Alex Williamson Signed-off-by: Gerd Hoffmann Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/hw/pci/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/hw') diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index a9c3ee5aa2..990d6fcbde 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -101,6 +101,7 @@ extern bool pci_available; #define PCI_DEVICE_ID_REDHAT_PCIE_RP 0x000c #define PCI_DEVICE_ID_REDHAT_XHCI 0x000d #define PCI_DEVICE_ID_REDHAT_PCIE_BRIDGE 0x000e +#define PCI_DEVICE_ID_REDHAT_MDPY 0x000f #define PCI_DEVICE_ID_REDHAT_QXL 0x0100 #define FMT_PCIBUS PRIx64 -- cgit v1.2.3-55-g7522 From 6f80e6170ede13605817e5c0ca73db0de7bdf261 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Thu, 12 Apr 2018 23:12:30 +0800 Subject: virtio: support setting memory region based host notifier This patch introduces the support for setting memory region based host notifiers for virtio device. This is helpful when using a hardware accelerator for a virtio device, because hardware heavily depends on the notification, this will allow the guest driver in the VM to notify the hardware directly. Signed-off-by: Tiwei Bie Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-pci.c | 22 ++++++++++++++++++++++ hw/virtio/virtio.c | 13 +++++++++++++ include/hw/virtio/virtio-bus.h | 2 ++ include/hw/virtio/virtio.h | 2 ++ 4 files changed, 39 insertions(+) (limited to 'include/hw') diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 1e8ab7bbc5..5eb0c323ca 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1037,6 +1037,27 @@ assign_error: return r; } +static int virtio_pci_set_host_notifier_mr(DeviceState *d, int n, + MemoryRegion *mr, bool assign) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + int offset; + + if (n >= VIRTIO_QUEUE_MAX || !virtio_pci_modern(proxy) || + virtio_pci_queue_mem_mult(proxy) != memory_region_size(mr)) { + return -1; + } + + if (assign) { + offset = virtio_pci_queue_mem_mult(proxy) * n; + memory_region_add_subregion_overlap(&proxy->notify.mr, offset, mr, 1); + } else { + memory_region_del_subregion(&proxy->notify.mr, mr); + } + + return 0; +} + static void virtio_pci_vmstate_change(DeviceState *d, bool running) { VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); @@ -2652,6 +2673,7 @@ static void virtio_pci_bus_class_init(ObjectClass *klass, void *data) k->has_extra_state = virtio_pci_has_extra_state; k->query_guest_notifiers = virtio_pci_query_guest_notifiers; k->set_guest_notifiers = virtio_pci_set_guest_notifiers; + k->set_host_notifier_mr = virtio_pci_set_host_notifier_mr; k->vmstate_change = virtio_pci_vmstate_change; k->pre_plugged = virtio_pci_pre_plugged; k->device_plugged = virtio_pci_device_plugged; diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 006d3d1148..1debb0147b 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2454,6 +2454,19 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq) return &vq->host_notifier; } +int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n, + MemoryRegion *mr, bool assign) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + + if (k->set_host_notifier_mr) { + return k->set_host_notifier_mr(qbus->parent, n, mr, assign); + } + + return -1; +} + void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name) { g_free(vdev->bus_name); diff --git a/include/hw/virtio/virtio-bus.h b/include/hw/virtio/virtio-bus.h index ced3d2d2b0..7fec9dc929 100644 --- a/include/hw/virtio/virtio-bus.h +++ b/include/hw/virtio/virtio-bus.h @@ -52,6 +52,8 @@ typedef struct VirtioBusClass { bool (*has_extra_state)(DeviceState *d); bool (*query_guest_notifiers)(DeviceState *d); int (*set_guest_notifiers)(DeviceState *d, int nvqs, bool assign); + int (*set_host_notifier_mr)(DeviceState *d, int n, + MemoryRegion *mr, bool assign); void (*vmstate_change)(DeviceState *d, bool running); /* * Expose the features the transport layer supports before diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 098bdaaea3..9c1fa07d6d 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -239,6 +239,8 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align); void virtio_queue_notify(VirtIODevice *vdev, int n); uint16_t virtio_queue_vector(VirtIODevice *vdev, int n); void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector); +int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n, + MemoryRegion *mr, bool assign); int virtio_set_status(VirtIODevice *vdev, uint8_t val); void virtio_reset(void *opaque); void virtio_update_irq(VirtIODevice *vdev); -- cgit v1.2.3-55-g7522 From 1a97a478e69534f70ebbed1f27c315dbdc23f94f Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 21 May 2018 10:32:00 -0600 Subject: nvdimm: fix typo in label-size definition Signed-off-by: Ross Zwisler Fixes: commit da6789c27c2e ("nvdimm: add a macro for property "label-size"") Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov Cc: Haozhong Zhang Cc: Michael S. Tsirkin Cc: Stefan Hajnoczi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/mem/nvdimm.c | 2 +- include/hw/mem/nvdimm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/hw') diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index acb656b672..4087aca25e 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -89,7 +89,7 @@ static void nvdimm_set_unarmed(Object *obj, bool value, Error **errp) static void nvdimm_init(Object *obj) { - object_property_add(obj, NVDIMM_LABLE_SIZE_PROP, "int", + object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int", nvdimm_get_label_size, nvdimm_set_label_size, NULL, NULL, NULL); object_property_add_bool(obj, NVDIMM_UNARMED_PROP, diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index 7fd87c4e1c..74c60332e1 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -48,7 +48,7 @@ #define NVDIMM_GET_CLASS(obj) OBJECT_GET_CLASS(NVDIMMClass, (obj), \ TYPE_NVDIMM) -#define NVDIMM_LABLE_SIZE_PROP "label-size" +#define NVDIMM_LABEL_SIZE_PROP "label-size" #define NVDIMM_UNARMED_PROP "unarmed" struct NVDIMMDevice { -- cgit v1.2.3-55-g7522 From b4a4ba0d68f50f218ee3957b6638dbee32a5eeef Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:10 +0800 Subject: intel-iommu: remove IntelIOMMUNotifierNode That is not really necessary. Removing that node struct and put the list entry directly into VTDAddressSpace. It simplfies the code a lot. Since at it, rename the old notifiers_list into vtd_as_with_notifiers. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 41 +++++++++++------------------------------ include/hw/i386/intel_iommu.h | 9 ++------- 2 files changed, 13 insertions(+), 37 deletions(-) (limited to 'include/hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index b359efd6f9..3df90457f8 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -1248,10 +1248,10 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) static void vtd_iommu_replay_all(IntelIOMMUState *s) { - IntelIOMMUNotifierNode *node; + VTDAddressSpace *vtd_as; - QLIST_FOREACH(node, &s->notifiers_list, next) { - memory_region_iommu_replay_all(&node->vtd_as->iommu); + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { + memory_region_iommu_replay_all(&vtd_as->iommu); } } @@ -1372,7 +1372,6 @@ static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) { - IntelIOMMUNotifierNode *node; VTDContextEntry ce; VTDAddressSpace *vtd_as; @@ -1381,8 +1380,7 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, &domain_id); - QLIST_FOREACH(node, &s->notifiers_list, next) { - vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce) && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { @@ -1402,12 +1400,11 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, uint16_t domain_id, hwaddr addr, uint8_t am) { - IntelIOMMUNotifierNode *node; + VTDAddressSpace *vtd_as; VTDContextEntry ce; int ret; - QLIST_FOREACH(node, &(s->notifiers_list), next) { - VTDAddressSpace *vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { @@ -2344,8 +2341,6 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); IntelIOMMUState *s = vtd_as->iommu_state; - IntelIOMMUNotifierNode *node = NULL; - IntelIOMMUNotifierNode *next_node = NULL; if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) { error_report("We need to set caching-mode=1 for intel-iommu to enable " @@ -2354,21 +2349,9 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, } if (old == IOMMU_NOTIFIER_NONE) { - node = g_malloc0(sizeof(*node)); - node->vtd_as = vtd_as; - QLIST_INSERT_HEAD(&s->notifiers_list, node, next); - return; - } - - /* update notifier node with new flags */ - QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) { - if (node->vtd_as == vtd_as) { - if (new == IOMMU_NOTIFIER_NONE) { - QLIST_REMOVE(node, next); - g_free(node); - } - return; - } + QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); + } else if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(vtd_as, next); } } @@ -2838,12 +2821,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) static void vtd_address_space_unmap_all(IntelIOMMUState *s) { - IntelIOMMUNotifierNode *node; VTDAddressSpace *vtd_as; IOMMUNotifier *n; - QLIST_FOREACH(node, &s->notifiers_list, next) { - vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { vtd_address_space_unmap(vtd_as, n); } @@ -3088,7 +3069,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) return; } - QLIST_INIT(&s->notifiers_list); + QLIST_INIT(&s->vtd_as_with_notifiers); memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, "intel_iommu", DMAR_REG_SIZE); diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 45ec8919b6..032e33bcb2 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -67,7 +67,6 @@ typedef union VTD_IR_TableEntry VTD_IR_TableEntry; typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress; typedef struct VTDIrq VTDIrq; typedef struct VTD_MSIMessage VTD_MSIMessage; -typedef struct IntelIOMMUNotifierNode IntelIOMMUNotifierNode; /* Context-Entry */ struct VTDContextEntry { @@ -93,6 +92,7 @@ struct VTDAddressSpace { MemoryRegion iommu_ir; /* Interrupt region: 0xfeeXXXXX */ IntelIOMMUState *iommu_state; VTDContextCacheEntry context_cache_entry; + QLIST_ENTRY(VTDAddressSpace) next; }; struct VTDBus { @@ -253,11 +253,6 @@ struct VTD_MSIMessage { /* When IR is enabled, all MSI/MSI-X data bits should be zero */ #define VTD_IR_MSI_DATA (0) -struct IntelIOMMUNotifierNode { - VTDAddressSpace *vtd_as; - QLIST_ENTRY(IntelIOMMUNotifierNode) next; -}; - /* The iommu (DMAR) device state struct */ struct IntelIOMMUState { X86IOMMUState x86_iommu; @@ -295,7 +290,7 @@ struct IntelIOMMUState { GHashTable *vtd_as_by_busptr; /* VTDBus objects indexed by PCIBus* reference */ VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */ /* list of registered notifiers */ - QLIST_HEAD(, IntelIOMMUNotifierNode) notifiers_list; + QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers; /* interrupt remapping */ bool intr_enabled; /* Whether guest enabled IR */ -- cgit v1.2.3-55-g7522 From 1d9efa73e12ddf361ea997c2d532cc4afa6674d1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:11 +0800 Subject: intel-iommu: add iommu lock SECURITY IMPLICATION: this patch fixes a potential race when multiple threads access the IOMMU IOTLB cache. Add a per-iommu big lock to protect IOMMU status. Currently the only thing to be protected is the IOTLB/context cache, since that can be accessed even without BQL, e.g., in IO dataplane. Note that we don't need to protect device page tables since that's fully controlled by the guest kernel. However there is still possibility that malicious drivers will program the device to not obey the rule. In that case QEMU can't really do anything useful, instead the guest itself will be responsible for all uncertainties. CC: QEMU Stable Reported-by: Fam Zheng Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 56 ++++++++++++++++++++++++++++++++++++------- include/hw/i386/intel_iommu.h | 6 +++++ 2 files changed, 53 insertions(+), 9 deletions(-) (limited to 'include/hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 3df90457f8..8d4069de9f 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -128,6 +128,16 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, return new_val; } +static inline void vtd_iommu_lock(IntelIOMMUState *s) +{ + qemu_mutex_lock(&s->iommu_lock); +} + +static inline void vtd_iommu_unlock(IntelIOMMUState *s) +{ + qemu_mutex_unlock(&s->iommu_lock); +} + /* GHashTable functions */ static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) { @@ -172,9 +182,9 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, } /* Reset all the gen of VTDAddressSpace to zero and set the gen of - * IntelIOMMUState to 1. + * IntelIOMMUState to 1. Must be called with IOMMU lock held. */ -static void vtd_reset_context_cache(IntelIOMMUState *s) +static void vtd_reset_context_cache_locked(IntelIOMMUState *s) { VTDAddressSpace *vtd_as; VTDBus *vtd_bus; @@ -197,12 +207,20 @@ static void vtd_reset_context_cache(IntelIOMMUState *s) s->context_cache_gen = 1; } -static void vtd_reset_iotlb(IntelIOMMUState *s) +/* Must be called with IOMMU lock held. */ +static void vtd_reset_iotlb_locked(IntelIOMMUState *s) { assert(s->iotlb); g_hash_table_remove_all(s->iotlb); } +static void vtd_reset_iotlb(IntelIOMMUState *s) +{ + vtd_iommu_lock(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); +} + static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, uint32_t level) { @@ -215,6 +233,7 @@ static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; } +/* Must be called with IOMMU lock held */ static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, hwaddr addr) { @@ -235,6 +254,7 @@ out: return entry; } +/* Must be with IOMMU lock held */ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, uint16_t domain_id, hwaddr addr, uint64_t slpte, uint8_t access_flags, uint32_t level) @@ -246,7 +266,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { trace_vtd_iotlb_reset("iotlb exceeds size limit"); - vtd_reset_iotlb(s); + vtd_reset_iotlb_locked(s); } entry->gfn = gfn; @@ -1106,7 +1126,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, IntelIOMMUState *s = vtd_as->iommu_state; VTDContextEntry ce; uint8_t bus_num = pci_bus_num(bus); - VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry; + VTDContextCacheEntry *cc_entry; uint64_t slpte, page_mask; uint32_t level; uint16_t source_id = vtd_make_source_id(bus_num, devfn); @@ -1123,6 +1143,10 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, */ assert(!vtd_is_interrupt_addr(addr)); + vtd_iommu_lock(s); + + cc_entry = &vtd_as->context_cache_entry; + /* Try to fetch slpte form IOTLB */ iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); if (iotlb_entry) { @@ -1182,7 +1206,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, * IOMMU region can be swapped back. */ vtd_pt_enable_fast_path(s, source_id); - + vtd_iommu_unlock(s); return true; } @@ -1203,6 +1227,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte, access_flags, level); out: + vtd_iommu_unlock(s); entry->iova = addr & page_mask; entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; entry->addr_mask = ~page_mask; @@ -1210,6 +1235,7 @@ out: return true; error: + vtd_iommu_unlock(s); entry->iova = 0; entry->translated_addr = 0; entry->addr_mask = 0; @@ -1258,10 +1284,13 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s) static void vtd_context_global_invalidate(IntelIOMMUState *s) { trace_vtd_inv_desc_cc_global(); + /* Protects context cache */ + vtd_iommu_lock(s); s->context_cache_gen++; if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { - vtd_reset_context_cache(s); + vtd_reset_context_cache_locked(s); } + vtd_iommu_unlock(s); vtd_switch_address_space_all(s); /* * From VT-d spec 6.5.2.1, a global context entry invalidation @@ -1313,7 +1342,9 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), VTD_PCI_FUNC(devfn_it)); + vtd_iommu_lock(s); vtd_as->context_cache_entry.context_cache_gen = 0; + vtd_iommu_unlock(s); /* * Do switch address space when needed, in case if the * device passthrough bit is switched. @@ -1377,8 +1408,10 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) trace_vtd_inv_desc_iotlb_domain(domain_id); + vtd_iommu_lock(s); g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, &domain_id); + vtd_iommu_unlock(s); QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), @@ -1426,7 +1459,9 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, info.domain_id = domain_id; info.addr = addr; info.mask = ~((1 << am) - 1); + vtd_iommu_lock(s); g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); + vtd_iommu_unlock(s); vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); } @@ -2929,8 +2964,10 @@ static void vtd_init(IntelIOMMUState *s) s->cap |= VTD_CAP_CM; } - vtd_reset_context_cache(s); - vtd_reset_iotlb(s); + vtd_iommu_lock(s); + vtd_reset_context_cache_locked(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); /* Define registers with default values and bit semantics */ vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); @@ -3070,6 +3107,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) } QLIST_INIT(&s->vtd_as_with_notifiers); + qemu_mutex_init(&s->iommu_lock); memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, "intel_iommu", DMAR_REG_SIZE); diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 032e33bcb2..016e74bedb 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -300,6 +300,12 @@ struct IntelIOMMUState { OnOffAuto intr_eim; /* Toggle for EIM cabability */ bool buggy_eim; /* Force buggy EIM unless eim=off */ uint8_t aw_bits; /* Host/IOVA address width (in bits) */ + + /* + * Protects IOMMU states in general. Currently it protects the + * per-IOMMU IOTLB cache, and context entry cache in VTDAddressSpace. + */ + QemuMutex iommu_lock; }; /* Find the VTD Address space associated with the given bus pointer, -- cgit v1.2.3-55-g7522 From 4f8a62a933a79094e44bc1b16b63bb23e62d67b4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:12 +0800 Subject: intel-iommu: only do page walk for MAP notifiers For UNMAP-only IOMMU notifiers, we don't need to walk the page tables. Fasten that procedure by skipping the page table walk. That should boost performance for UNMAP-only notifiers like vhost. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 44 ++++++++++++++++++++++++++++++++++++++----- include/hw/i386/intel_iommu.h | 2 ++ 2 files changed, 41 insertions(+), 5 deletions(-) (limited to 'include/hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 8d4069de9f..38ccc741f9 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -138,6 +138,12 @@ static inline void vtd_iommu_unlock(IntelIOMMUState *s) qemu_mutex_unlock(&s->iommu_lock); } +/* Whether the address space needs to notify new mappings */ +static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) +{ + return as->notifier_flags & IOMMU_NOTIFIER_MAP; +} + /* GHashTable functions */ static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) { @@ -1436,14 +1442,36 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, VTDAddressSpace *vtd_as; VTDContextEntry ce; int ret; + hwaddr size = (1 << am) * VTD_PAGE_SIZE; QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { - vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE, - vtd_page_invalidate_notify_hook, - (void *)&vtd_as->iommu, true, s->aw_bits); + if (vtd_as_has_map_notifier(vtd_as)) { + /* + * As long as we have MAP notifications registered in + * any of our IOMMU notifiers, we need to sync the + * shadow page table. + */ + vtd_page_walk(&ce, addr, addr + size, + vtd_page_invalidate_notify_hook, + (void *)&vtd_as->iommu, true, s->aw_bits); + } else { + /* + * For UNMAP-only notifiers, we don't need to walk the + * page tables. We just deliver the PSI down to + * invalidate caches. + */ + IOMMUTLBEntry entry = { + .target_as = &address_space_memory, + .iova = addr, + .translated_addr = 0, + .addr_mask = size - 1, + .perm = IOMMU_NONE, + }; + memory_region_notify_iommu(&vtd_as->iommu, entry); + } } } } @@ -2383,6 +2411,9 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, exit(1); } + /* Update per-address-space notifier flags */ + vtd_as->notifier_flags = new; + if (old == IOMMU_NOTIFIER_NONE) { QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); } else if (new == IOMMU_NOTIFIER_NONE) { @@ -2891,8 +2922,11 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) PCI_FUNC(vtd_as->devfn), VTD_CONTEXT_ENTRY_DID(ce.hi), ce.hi, ce.lo); - vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false, - s->aw_bits); + if (vtd_as_has_map_notifier(vtd_as)) { + /* This is required only for MAP typed notifiers */ + vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false, + s->aw_bits); + } } else { trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), PCI_FUNC(vtd_as->devfn)); diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 016e74bedb..156f35e919 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -93,6 +93,8 @@ struct VTDAddressSpace { IntelIOMMUState *iommu_state; VTDContextCacheEntry context_cache_entry; QLIST_ENTRY(VTDAddressSpace) next; + /* Superset of notifier flags that this address space has */ + IOMMUNotifierFlag notifier_flags; }; struct VTDBus { -- cgit v1.2.3-55-g7522 From 63b88968f139b6a77f2f81e6f1eedf70c0170a85 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:17 +0800 Subject: intel-iommu: rework the page walk logic This patch fixes a potential small window that the DMA page table might be incomplete or invalid when the guest sends domain/context invalidations to a device. This can cause random DMA errors for assigned devices. This is a major change to the VT-d shadow page walking logic. It includes but is not limited to: - For each VTDAddressSpace, now we maintain what IOVA ranges we have mapped and what we have not. With that information, now we only send MAP or UNMAP when necessary. Say, we don't send MAP notifies if we know we have already mapped the range, meanwhile we don't send UNMAP notifies if we know we never mapped the range at all. - Introduce vtd_sync_shadow_page_table[_range] APIs so that we can call in any places to resync the shadow page table for a device. - When we receive domain/context invalidation, we should not really run the replay logic, instead we use the new sync shadow page table API to resync the whole shadow page table without unmapping the whole region. After this change, we'll only do the page walk once for each domain invalidations (before this, it can be multiple, depending on number of notifiers per address space). While at it, the page walking logic is also refactored to be simpler. CC: QEMU Stable Reported-by: Jintack Lim Tested-by: Jintack Lim Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 213 ++++++++++++++++++++++++++++++------------ hw/i386/trace-events | 3 +- include/hw/i386/intel_iommu.h | 2 + 3 files changed, 159 insertions(+), 59 deletions(-) (limited to 'include/hw') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 61bb3d31e7..b5a09b7908 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -769,10 +769,77 @@ typedef struct { static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) { + VTDAddressSpace *as = info->as; vtd_page_walk_hook hook_fn = info->hook_fn; void *private = info->private; + DMAMap target = { + .iova = entry->iova, + .size = entry->addr_mask, + .translated_addr = entry->translated_addr, + .perm = entry->perm, + }; + DMAMap *mapped = iova_tree_find(as->iova_tree, &target); + + if (entry->perm == IOMMU_NONE && !info->notify_unmap) { + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } assert(hook_fn); + + /* Update local IOVA mapped ranges */ + if (entry->perm) { + if (mapped) { + /* If it's exactly the same translation, skip */ + if (!memcmp(mapped, &target, sizeof(target))) { + trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask, + entry->translated_addr); + return 0; + } else { + /* + * Translation changed. Normally this should not + * happen, but it can happen when with buggy guest + * OSes. Note that there will be a small window that + * we don't have map at all. But that's the best + * effort we can do. The ideal way to emulate this is + * atomically modify the PTE to follow what has + * changed, but we can't. One example is that vfio + * driver only has VFIO_IOMMU_[UN]MAP_DMA but no + * interface to modify a mapping (meanwhile it seems + * meaningless to even provide one). Anyway, let's + * mark this as a TODO in case one day we'll have + * a better solution. + */ + IOMMUAccessFlags cache_perm = entry->perm; + int ret; + + /* Emulate an UNMAP */ + entry->perm = IOMMU_NONE; + trace_vtd_page_walk_one(info->domain_id, + entry->iova, + entry->translated_addr, + entry->addr_mask, + entry->perm); + ret = hook_fn(entry, private); + if (ret) { + return ret; + } + /* Drop any existing mapping */ + iova_tree_remove(as->iova_tree, &target); + /* Recover the correct permission */ + entry->perm = cache_perm; + } + } + iova_tree_insert(as->iova_tree, &target); + } else { + if (!mapped) { + /* Skip since we didn't map this range at all */ + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + iova_tree_remove(as->iova_tree, &target); + } + trace_vtd_page_walk_one(info->domain_id, entry->iova, entry->translated_addr, entry->addr_mask, entry->perm); @@ -834,45 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, */ entry_valid = read_cur | write_cur; - entry.target_as = &address_space_memory; - entry.iova = iova & subpage_mask; - entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); - entry.addr_mask = ~subpage_mask; - - if (vtd_is_last_slpte(slpte, level)) { - /* NOTE: this is only meaningful if entry_valid == true */ - entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); - if (!entry_valid && !info->notify_unmap) { - trace_vtd_page_walk_skip_perm(iova, iova_next); - goto next; - } - ret = vtd_page_walk_one(&entry, info); - if (ret < 0) { - return ret; - } - } else { - if (!entry_valid) { - if (info->notify_unmap) { - /* - * The whole entry is invalid; unmap it all. - * Translated address is meaningless, zero it. - */ - entry.translated_addr = 0x0; - ret = vtd_page_walk_one(&entry, info); - if (ret < 0) { - return ret; - } - } else { - trace_vtd_page_walk_skip_perm(iova, iova_next); - } - goto next; - } + if (!vtd_is_last_slpte(slpte, level) && entry_valid) { + /* + * This is a valid PDE (or even bigger than PDE). We need + * to walk one further level. + */ ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), iova, MIN(iova_next, end), level - 1, read_cur, write_cur, info); - if (ret < 0) { - return ret; - } + } else { + /* + * This means we are either: + * + * (1) the real page entry (either 4K page, or huge page) + * (2) the whole range is invalid + * + * In either case, we send an IOTLB notification down. + */ + entry.target_as = &address_space_memory; + entry.iova = iova & subpage_mask; + entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); + entry.addr_mask = ~subpage_mask; + /* NOTE: this is only meaningful if entry_valid == true */ + entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); + ret = vtd_page_walk_one(&entry, info); + } + + if (ret < 0) { + return ret; } next: @@ -964,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, return 0; } +static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, + void *private) +{ + memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); + return 0; +} + +/* If context entry is NULL, we'll try to fetch it on our own. */ +static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, + VTDContextEntry *ce, + hwaddr addr, hwaddr size) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + vtd_page_walk_info info = { + .hook_fn = vtd_sync_shadow_page_hook, + .private = (void *)&vtd_as->iommu, + .notify_unmap = true, + .aw = s->aw_bits, + .as = vtd_as, + }; + VTDContextEntry ce_cache; + int ret; + + if (ce) { + /* If the caller provided context entry, use it */ + ce_cache = *ce; + } else { + /* If the caller didn't provide ce, try to fetch */ + ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce_cache); + if (ret) { + /* + * This should not really happen, but in case it happens, + * we just skip the sync for this time. After all we even + * don't have the root table pointer! + */ + trace_vtd_err("Detected invalid context entry when " + "trying to sync shadow page table"); + return 0; + } + } + + info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi); + + return vtd_page_walk(&ce_cache, addr, addr + size, &info); +} + +static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) +{ + return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX); +} + /* * Fetch translation type for specific device. Returns <0 if error * happens, otherwise return the shifted type to check against @@ -1296,7 +1404,7 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s) VTDAddressSpace *vtd_as; QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } @@ -1371,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, vtd_switch_address_space(vtd_as); /* * So a device is moving out of (or moving into) a - * domain, a replay() suites here to notify all the - * IOMMU_NOTIFIER_MAP registers about this change. + * domain, resync the shadow page table. * This won't bring bad even if we have no such * notifier registered - the IOMMU notification * framework will skip MAP notifications if that * happened. */ - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } @@ -1436,18 +1543,11 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce) && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } -static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry, - void *private) -{ - memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); - return 0; -} - static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, uint16_t domain_id, hwaddr addr, uint8_t am) @@ -1462,21 +1562,12 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { if (vtd_as_has_map_notifier(vtd_as)) { - vtd_page_walk_info info = { - .hook_fn = vtd_page_invalidate_notify_hook, - .private = (void *)&vtd_as->iommu, - .notify_unmap = true, - .aw = s->aw_bits, - .as = vtd_as, - .domain_id = domain_id, - }; - /* * As long as we have MAP notifications registered in * any of our IOMMU notifiers, we need to sync the * shadow page table. */ - vtd_page_walk(&ce, addr, addr + size, &info); + vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); } else { /* * For UNMAP-only notifiers, we don't need to walk the @@ -2806,6 +2897,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) vtd_dev_as->devfn = (uint8_t)devfn; vtd_dev_as->iommu_state = s; vtd_dev_as->context_cache_entry.context_cache_gen = 0; + vtd_dev_as->iova_tree = iova_tree_new(); /* * Memory region relationships looks like (Address range shows @@ -2858,6 +2950,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) hwaddr start = n->start; hwaddr end = n->end; IntelIOMMUState *s = as->iommu_state; + DMAMap map; /* * Note: all the codes in this function has a assumption that IOVA @@ -2902,6 +2995,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) VTD_PCI_FUNC(as->devfn), entry.iova, size); + map.iova = entry.iova; + map.size = entry.addr_mask; + iova_tree_remove(as->iova_tree, &map); + memory_region_notify_one(n, &entry); } diff --git a/hw/i386/trace-events b/hw/i386/trace-events index ca23ba9fad..e14d06ec83 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -40,8 +40,9 @@ vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint6 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64 vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64 +vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" -vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty" vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 156f35e919..fbfedcb1c0 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -27,6 +27,7 @@ #include "hw/i386/ioapic.h" #include "hw/pci/msi.h" #include "hw/sysbus.h" +#include "qemu/iova-tree.h" #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu" #define INTEL_IOMMU_DEVICE(obj) \ @@ -95,6 +96,7 @@ struct VTDAddressSpace { QLIST_ENTRY(VTDAddressSpace) next; /* Superset of notifier flags that this address space has */ IOMMUNotifierFlag notifier_flags; + IOVATree *iova_tree; /* Traces mapped IOVA ranges */ }; struct VTDBus { -- cgit v1.2.3-55-g7522