From 50a6fa8f939a3448967592e5136afcc063f9a25d Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 24 Apr 2018 08:38:57 +0200 Subject: allocate pci id for mdpy mdpy is a sample pci device for vfio-mdev. Not (yet) merged upstream, patch available here: https://www.kraxel.org/cgit/linux/commit/?h=vfio-sample-display&id=6fd86cff3d7df38ab89625b16fdd6434b1c18749 Cc: Alex Williamson Signed-off-by: Gerd Hoffmann Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/hw/pci/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index a9c3ee5aa2..990d6fcbde 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -101,6 +101,7 @@ extern bool pci_available; #define PCI_DEVICE_ID_REDHAT_PCIE_RP 0x000c #define PCI_DEVICE_ID_REDHAT_XHCI 0x000d #define PCI_DEVICE_ID_REDHAT_PCIE_BRIDGE 0x000e +#define PCI_DEVICE_ID_REDHAT_MDPY 0x000f #define PCI_DEVICE_ID_REDHAT_QXL 0x0100 #define FMT_PCIBUS PRIx64 -- cgit v1.2.3-55-g7522 From b7b126442977dcfa5641e38b084bd13a0b366cff Mon Sep 17 00:00:00 2001 From: Jonathan Helman Date: Mon, 19 Mar 2018 15:28:49 -0700 Subject: virtio-balloon: add hugetlb page allocation counts qemu should read and report hugetlb page allocation counts exported in the following kernel patch: commit 4c3ca37c4a4394978fd0f005625f6064ed2b9a64 Author: Jonathan Helman Date: Mon Mar 19 11:00:35 2018 -0700 virtio_balloon: export hugetlb page allocation counts Export the number of successful and failed hugetlb page allocations via the virtio balloon driver. These 2 counts come directly from the vm_events HTLB_BUDDY_PGALLOC and HTLB_BUDDY_PGALLOC_FAIL. Signed-off-by: Jonathan Helman Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Jason Wang --- docs/virtio-balloon-stats.txt | 2 ++ hw/virtio/virtio-balloon.c | 2 ++ include/standard-headers/linux/virtio_balloon.h | 4 +++- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/docs/virtio-balloon-stats.txt b/docs/virtio-balloon-stats.txt index 7a66d25da5..9985e1dffc 100644 --- a/docs/virtio-balloon-stats.txt +++ b/docs/virtio-balloon-stats.txt @@ -34,6 +34,8 @@ which will return a dictionary containing: - stat-total-memory - stat-available-memory - stat-disk-caches + - stat-htlb-pgalloc + - stat-htlb-pgfail o A key named last-update, which contains the last stats update timestamp in seconds. Since this timestamp is generated by the host, diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index f456cea2e7..1f7a87f094 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -52,6 +52,8 @@ static const char *balloon_stat_names[] = { [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory", [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory", [VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches", + [VIRTIO_BALLOON_S_HTLB_PGALLOC] = "stat-htlb-pgalloc", + [VIRTIO_BALLOON_S_HTLB_PGFAIL] = "stat-htlb-pgfail", [VIRTIO_BALLOON_S_NR] = NULL }; diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h index 7b0a41b8fc..e446805ae9 100644 --- a/include/standard-headers/linux/virtio_balloon.h +++ b/include/standard-headers/linux/virtio_balloon.h @@ -53,7 +53,9 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of memory */ #define VIRTIO_BALLOON_S_AVAIL 6 /* Available memory as in /proc */ #define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */ -#define VIRTIO_BALLOON_S_NR 8 +#define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Hugetlb page allocations */ +#define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */ +#define VIRTIO_BALLOON_S_NR 10 /* * Memory statistics structure. -- cgit v1.2.3-55-g7522 From d49bd359d186a83fbb66a23dafd2d6664b79aa6a Mon Sep 17 00:00:00 2001 From: Michael S. Tsirkin Date: Tue, 17 Apr 2018 21:45:56 +0300 Subject: include/standard-headers: add asm-x86/kvm_para.h Import asm-x86/kvm_para.h from linux where it can be easily used on Linux and non-Linux platforms. Signed-off-by: Michael S. Tsirkin --- include/standard-headers/asm-x86/kvm_para.h | 121 ++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 include/standard-headers/asm-x86/kvm_para.h (limited to 'include') diff --git a/include/standard-headers/asm-x86/kvm_para.h b/include/standard-headers/asm-x86/kvm_para.h new file mode 100644 index 0000000000..53a85ae3ed --- /dev/null +++ b/include/standard-headers/asm-x86/kvm_para.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _ASM_X86_KVM_PARA_H +#define _ASM_X86_KVM_PARA_H + +#include "standard-headers/linux/types.h" + +/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It + * should be used to determine that a VM is running under KVM. + */ +#define KVM_CPUID_SIGNATURE 0x40000000 + +/* This CPUID returns two feature bitmaps in eax, edx. Before enabling + * a particular paravirtualization, the appropriate feature bit should + * be checked in eax. The performance hint feature bit should be checked + * in edx. + */ +#define KVM_CPUID_FEATURES 0x40000001 +#define KVM_FEATURE_CLOCKSOURCE 0 +#define KVM_FEATURE_NOP_IO_DELAY 1 +#define KVM_FEATURE_MMU_OP 2 +/* This indicates that the new set of kvmclock msrs + * are available. The use of 0x11 and 0x12 is deprecated + */ +#define KVM_FEATURE_CLOCKSOURCE2 3 +#define KVM_FEATURE_ASYNC_PF 4 +#define KVM_FEATURE_STEAL_TIME 5 +#define KVM_FEATURE_PV_EOI 6 +#define KVM_FEATURE_PV_UNHALT 7 +#define KVM_FEATURE_PV_TLB_FLUSH 9 +#define KVM_FEATURE_ASYNC_PF_VMEXIT 10 + +#define KVM_HINTS_DEDICATED 0 + +/* The last 8 bits are used to indicate how to interpret the flags field + * in pvclock structure. If no bits are set, all flags are ignored. + */ +#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 + +#define MSR_KVM_WALL_CLOCK 0x11 +#define MSR_KVM_SYSTEM_TIME 0x12 + +#define KVM_MSR_ENABLED 1 +/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ +#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 +#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 +#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 +#define MSR_KVM_STEAL_TIME 0x4b564d03 +#define MSR_KVM_PV_EOI_EN 0x4b564d04 + +struct kvm_steal_time { + uint64_t steal; + uint32_t version; + uint32_t flags; + uint8_t preempted; + uint8_t uint8_t_pad[3]; + uint32_t pad[11]; +}; + +#define KVM_VCPU_PREEMPTED (1 << 0) +#define KVM_VCPU_FLUSH_TLB (1 << 1) + +#define KVM_CLOCK_PAIRING_WALLCLOCK 0 +struct kvm_clock_pairing { + int64_t sec; + int64_t nsec; + uint64_t tsc; + uint32_t flags; + uint32_t pad[9]; +}; + +#define KVM_STEAL_ALIGNMENT_BITS 5 +#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) +#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) + +#define KVM_MAX_MMU_OP_BATCH 32 + +#define KVM_ASYNC_PF_ENABLED (1 << 0) +#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) +#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2) + +/* Operations for KVM_HC_MMU_OP */ +#define KVM_MMU_OP_WRITE_PTE 1 +#define KVM_MMU_OP_FLUSH_TLB 2 +#define KVM_MMU_OP_RELEASE_PT 3 + +/* Payload for KVM_HC_MMU_OP */ +struct kvm_mmu_op_header { + uint32_t op; + uint32_t pad; +}; + +struct kvm_mmu_op_write_pte { + struct kvm_mmu_op_header header; + uint64_t pte_phys; + uint64_t pte_val; +}; + +struct kvm_mmu_op_flush_tlb { + struct kvm_mmu_op_header header; +}; + +struct kvm_mmu_op_release_pt { + struct kvm_mmu_op_header header; + uint64_t pt_phys; +}; + +#define KVM_PV_REASON_PAGE_NOT_PRESENT 1 +#define KVM_PV_REASON_PAGE_READY 2 + +struct kvm_vcpu_pv_apf_data { + uint32_t reason; + uint8_t pad[60]; + uint32_t enabled; +}; + +#define KVM_PV_EOI_BIT 0 +#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) +#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK +#define KVM_PV_EOI_DISABLED 0x0 + +#endif /* _ASM_X86_KVM_PARA_H */ -- cgit v1.2.3-55-g7522 From 1814eab67398029a927847dfd29198a26aeaf7d8 Mon Sep 17 00:00:00 2001 From: Michael S. Tsirkin Date: Tue, 17 Apr 2018 21:47:50 +0300 Subject: x86/cpu: use standard-headers/asm-x86.kvm_para.h Switch to the header we imported from Linux, this allows us to drop a hack in kvm_i386.h. More code will be dropped in the next patch. Signed-off-by: Michael S. Tsirkin --- hw/i386/kvm/clock.c | 2 +- include/sysemu/kvm.h | 1 - target/i386/cpu.c | 4 +--- target/i386/cpu.h | 2 -- target/i386/kvm.c | 4 ++-- target/i386/kvm_i386.h | 6 ------ 6 files changed, 4 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c index 7dac319403..0bf1c60a06 100644 --- a/hw/i386/kvm/clock.c +++ b/hw/i386/kvm/clock.c @@ -26,7 +26,7 @@ #include "qapi/error.h" #include -#include +#include "standard-headers/asm-x86/kvm_para.h" #define TYPE_KVM_CLOCK "kvmclock" #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 23669c4d5a..0b64b8e067 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -22,7 +22,6 @@ #ifdef NEED_CPU_H # ifdef CONFIG_KVM # include -# include # define CONFIG_KVM_IS_POSSIBLE # endif #else diff --git a/target/i386/cpu.c b/target/i386/cpu.c index d95310ffd4..94260412e2 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -40,9 +40,7 @@ #include "qom/qom-qobject.h" #include "sysemu/arch_init.h" -#if defined(CONFIG_KVM) -#include -#endif +#include "standard-headers/asm-x86/kvm_para.h" #include "sysemu/sysemu.h" #include "hw/qdev-properties.h" diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 8ac13f6c2c..664504610e 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -688,8 +688,6 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS]; #define CPUID_7_0_EDX_SPEC_CTRL (1U << 26) /* Speculation Control */ #define CPUID_7_0_EDX_SPEC_CTRL_SSBD (1U << 31) /* Speculative Store Bypass Disable */ -#define KVM_HINTS_DEDICATED (1U << 0) - #define CPUID_8000_0008_EBX_IBPB (1U << 12) /* Indirect Branch Prediction Barrier */ #define CPUID_XSAVE_XSAVEOPT (1U << 0) diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 0c656a91a4..6511329d11 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -18,7 +18,7 @@ #include #include -#include +#include "standard-headers/asm-x86/kvm_para.h" #include "qemu-common.h" #include "cpu.h" @@ -387,7 +387,7 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, ret &= ~(1U << KVM_FEATURE_PV_UNHALT); } } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) { - ret |= KVM_HINTS_DEDICATED; + ret |= 1U << KVM_HINTS_DEDICATED; found = 1; } diff --git a/target/i386/kvm_i386.h b/target/i386/kvm_i386.h index 1de9876cd9..e5df24cad1 100644 --- a/target/i386/kvm_i386.h +++ b/target/i386/kvm_i386.h @@ -30,12 +30,6 @@ #define kvm_pic_in_kernel() 0 #define kvm_ioapic_in_kernel() 0 -/* These constants must never be used at runtime if kvm_enabled() is false. - * They exist so we don't need #ifdefs around KVM-specific code that already - * checks kvm_enabled() properly. - */ -#define KVM_CPUID_FEATURES 0 - #endif /* CONFIG_KVM */ bool kvm_allows_irq0_override(void); -- cgit v1.2.3-55-g7522 From 6f80e6170ede13605817e5c0ca73db0de7bdf261 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Thu, 12 Apr 2018 23:12:30 +0800 Subject: virtio: support setting memory region based host notifier This patch introduces the support for setting memory region based host notifiers for virtio device. This is helpful when using a hardware accelerator for a virtio device, because hardware heavily depends on the notification, this will allow the guest driver in the VM to notify the hardware directly. Signed-off-by: Tiwei Bie Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-pci.c | 22 ++++++++++++++++++++++ hw/virtio/virtio.c | 13 +++++++++++++ include/hw/virtio/virtio-bus.h | 2 ++ include/hw/virtio/virtio.h | 2 ++ 4 files changed, 39 insertions(+) (limited to 'include') diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 1e8ab7bbc5..5eb0c323ca 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1037,6 +1037,27 @@ assign_error: return r; } +static int virtio_pci_set_host_notifier_mr(DeviceState *d, int n, + MemoryRegion *mr, bool assign) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + int offset; + + if (n >= VIRTIO_QUEUE_MAX || !virtio_pci_modern(proxy) || + virtio_pci_queue_mem_mult(proxy) != memory_region_size(mr)) { + return -1; + } + + if (assign) { + offset = virtio_pci_queue_mem_mult(proxy) * n; + memory_region_add_subregion_overlap(&proxy->notify.mr, offset, mr, 1); + } else { + memory_region_del_subregion(&proxy->notify.mr, mr); + } + + return 0; +} + static void virtio_pci_vmstate_change(DeviceState *d, bool running) { VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); @@ -2652,6 +2673,7 @@ static void virtio_pci_bus_class_init(ObjectClass *klass, void *data) k->has_extra_state = virtio_pci_has_extra_state; k->query_guest_notifiers = virtio_pci_query_guest_notifiers; k->set_guest_notifiers = virtio_pci_set_guest_notifiers; + k->set_host_notifier_mr = virtio_pci_set_host_notifier_mr; k->vmstate_change = virtio_pci_vmstate_change; k->pre_plugged = virtio_pci_pre_plugged; k->device_plugged = virtio_pci_device_plugged; diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 006d3d1148..1debb0147b 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2454,6 +2454,19 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq) return &vq->host_notifier; } +int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n, + MemoryRegion *mr, bool assign) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + + if (k->set_host_notifier_mr) { + return k->set_host_notifier_mr(qbus->parent, n, mr, assign); + } + + return -1; +} + void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name) { g_free(vdev->bus_name); diff --git a/include/hw/virtio/virtio-bus.h b/include/hw/virtio/virtio-bus.h index ced3d2d2b0..7fec9dc929 100644 --- a/include/hw/virtio/virtio-bus.h +++ b/include/hw/virtio/virtio-bus.h @@ -52,6 +52,8 @@ typedef struct VirtioBusClass { bool (*has_extra_state)(DeviceState *d); bool (*query_guest_notifiers)(DeviceState *d); int (*set_guest_notifiers)(DeviceState *d, int nvqs, bool assign); + int (*set_host_notifier_mr)(DeviceState *d, int n, + MemoryRegion *mr, bool assign); void (*vmstate_change)(DeviceState *d, bool running); /* * Expose the features the transport layer supports before diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 098bdaaea3..9c1fa07d6d 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -239,6 +239,8 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align); void virtio_queue_notify(VirtIODevice *vdev, int n); uint16_t virtio_queue_vector(VirtIODevice *vdev, int n); void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector); +int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n, + MemoryRegion *mr, bool assign); int virtio_set_status(VirtIODevice *vdev, uint8_t val); void virtio_reset(void *opaque); void virtio_update_irq(VirtIODevice *vdev); -- cgit v1.2.3-55-g7522 From 1a97a478e69534f70ebbed1f27c315dbdc23f94f Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 21 May 2018 10:32:00 -0600 Subject: nvdimm: fix typo in label-size definition Signed-off-by: Ross Zwisler Fixes: commit da6789c27c2e ("nvdimm: add a macro for property "label-size"") Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov Cc: Haozhong Zhang Cc: Michael S. Tsirkin Cc: Stefan Hajnoczi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/mem/nvdimm.c | 2 +- include/hw/mem/nvdimm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index acb656b672..4087aca25e 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -89,7 +89,7 @@ static void nvdimm_set_unarmed(Object *obj, bool value, Error **errp) static void nvdimm_init(Object *obj) { - object_property_add(obj, NVDIMM_LABLE_SIZE_PROP, "int", + object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int", nvdimm_get_label_size, nvdimm_set_label_size, NULL, NULL, NULL); object_property_add_bool(obj, NVDIMM_UNARMED_PROP, diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index 7fd87c4e1c..74c60332e1 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -48,7 +48,7 @@ #define NVDIMM_GET_CLASS(obj) OBJECT_GET_CLASS(NVDIMMClass, (obj), \ TYPE_NVDIMM) -#define NVDIMM_LABLE_SIZE_PROP "label-size" +#define NVDIMM_LABEL_SIZE_PROP "label-size" #define NVDIMM_UNARMED_PROP "unarmed" struct NVDIMMDevice { -- cgit v1.2.3-55-g7522 From b4a4ba0d68f50f218ee3957b6638dbee32a5eeef Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:10 +0800 Subject: intel-iommu: remove IntelIOMMUNotifierNode That is not really necessary. Removing that node struct and put the list entry directly into VTDAddressSpace. It simplfies the code a lot. Since at it, rename the old notifiers_list into vtd_as_with_notifiers. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 41 +++++++++++------------------------------ include/hw/i386/intel_iommu.h | 9 ++------- 2 files changed, 13 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index b359efd6f9..3df90457f8 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -1248,10 +1248,10 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) static void vtd_iommu_replay_all(IntelIOMMUState *s) { - IntelIOMMUNotifierNode *node; + VTDAddressSpace *vtd_as; - QLIST_FOREACH(node, &s->notifiers_list, next) { - memory_region_iommu_replay_all(&node->vtd_as->iommu); + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { + memory_region_iommu_replay_all(&vtd_as->iommu); } } @@ -1372,7 +1372,6 @@ static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) { - IntelIOMMUNotifierNode *node; VTDContextEntry ce; VTDAddressSpace *vtd_as; @@ -1381,8 +1380,7 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, &domain_id); - QLIST_FOREACH(node, &s->notifiers_list, next) { - vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce) && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { @@ -1402,12 +1400,11 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, uint16_t domain_id, hwaddr addr, uint8_t am) { - IntelIOMMUNotifierNode *node; + VTDAddressSpace *vtd_as; VTDContextEntry ce; int ret; - QLIST_FOREACH(node, &(s->notifiers_list), next) { - VTDAddressSpace *vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { @@ -2344,8 +2341,6 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); IntelIOMMUState *s = vtd_as->iommu_state; - IntelIOMMUNotifierNode *node = NULL; - IntelIOMMUNotifierNode *next_node = NULL; if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) { error_report("We need to set caching-mode=1 for intel-iommu to enable " @@ -2354,21 +2349,9 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, } if (old == IOMMU_NOTIFIER_NONE) { - node = g_malloc0(sizeof(*node)); - node->vtd_as = vtd_as; - QLIST_INSERT_HEAD(&s->notifiers_list, node, next); - return; - } - - /* update notifier node with new flags */ - QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) { - if (node->vtd_as == vtd_as) { - if (new == IOMMU_NOTIFIER_NONE) { - QLIST_REMOVE(node, next); - g_free(node); - } - return; - } + QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); + } else if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(vtd_as, next); } } @@ -2838,12 +2821,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) static void vtd_address_space_unmap_all(IntelIOMMUState *s) { - IntelIOMMUNotifierNode *node; VTDAddressSpace *vtd_as; IOMMUNotifier *n; - QLIST_FOREACH(node, &s->notifiers_list, next) { - vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { vtd_address_space_unmap(vtd_as, n); } @@ -3088,7 +3069,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) return; } - QLIST_INIT(&s->notifiers_list); + QLIST_INIT(&s->vtd_as_with_notifiers); memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, "intel_iommu", DMAR_REG_SIZE); diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 45ec8919b6..032e33bcb2 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -67,7 +67,6 @@ typedef union VTD_IR_TableEntry VTD_IR_TableEntry; typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress; typedef struct VTDIrq VTDIrq; typedef struct VTD_MSIMessage VTD_MSIMessage; -typedef struct IntelIOMMUNotifierNode IntelIOMMUNotifierNode; /* Context-Entry */ struct VTDContextEntry { @@ -93,6 +92,7 @@ struct VTDAddressSpace { MemoryRegion iommu_ir; /* Interrupt region: 0xfeeXXXXX */ IntelIOMMUState *iommu_state; VTDContextCacheEntry context_cache_entry; + QLIST_ENTRY(VTDAddressSpace) next; }; struct VTDBus { @@ -253,11 +253,6 @@ struct VTD_MSIMessage { /* When IR is enabled, all MSI/MSI-X data bits should be zero */ #define VTD_IR_MSI_DATA (0) -struct IntelIOMMUNotifierNode { - VTDAddressSpace *vtd_as; - QLIST_ENTRY(IntelIOMMUNotifierNode) next; -}; - /* The iommu (DMAR) device state struct */ struct IntelIOMMUState { X86IOMMUState x86_iommu; @@ -295,7 +290,7 @@ struct IntelIOMMUState { GHashTable *vtd_as_by_busptr; /* VTDBus objects indexed by PCIBus* reference */ VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */ /* list of registered notifiers */ - QLIST_HEAD(, IntelIOMMUNotifierNode) notifiers_list; + QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers; /* interrupt remapping */ bool intr_enabled; /* Whether guest enabled IR */ -- cgit v1.2.3-55-g7522 From 1d9efa73e12ddf361ea997c2d532cc4afa6674d1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:11 +0800 Subject: intel-iommu: add iommu lock SECURITY IMPLICATION: this patch fixes a potential race when multiple threads access the IOMMU IOTLB cache. Add a per-iommu big lock to protect IOMMU status. Currently the only thing to be protected is the IOTLB/context cache, since that can be accessed even without BQL, e.g., in IO dataplane. Note that we don't need to protect device page tables since that's fully controlled by the guest kernel. However there is still possibility that malicious drivers will program the device to not obey the rule. In that case QEMU can't really do anything useful, instead the guest itself will be responsible for all uncertainties. CC: QEMU Stable Reported-by: Fam Zheng Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 56 ++++++++++++++++++++++++++++++++++++------- include/hw/i386/intel_iommu.h | 6 +++++ 2 files changed, 53 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 3df90457f8..8d4069de9f 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -128,6 +128,16 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, return new_val; } +static inline void vtd_iommu_lock(IntelIOMMUState *s) +{ + qemu_mutex_lock(&s->iommu_lock); +} + +static inline void vtd_iommu_unlock(IntelIOMMUState *s) +{ + qemu_mutex_unlock(&s->iommu_lock); +} + /* GHashTable functions */ static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) { @@ -172,9 +182,9 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, } /* Reset all the gen of VTDAddressSpace to zero and set the gen of - * IntelIOMMUState to 1. + * IntelIOMMUState to 1. Must be called with IOMMU lock held. */ -static void vtd_reset_context_cache(IntelIOMMUState *s) +static void vtd_reset_context_cache_locked(IntelIOMMUState *s) { VTDAddressSpace *vtd_as; VTDBus *vtd_bus; @@ -197,12 +207,20 @@ static void vtd_reset_context_cache(IntelIOMMUState *s) s->context_cache_gen = 1; } -static void vtd_reset_iotlb(IntelIOMMUState *s) +/* Must be called with IOMMU lock held. */ +static void vtd_reset_iotlb_locked(IntelIOMMUState *s) { assert(s->iotlb); g_hash_table_remove_all(s->iotlb); } +static void vtd_reset_iotlb(IntelIOMMUState *s) +{ + vtd_iommu_lock(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); +} + static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, uint32_t level) { @@ -215,6 +233,7 @@ static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; } +/* Must be called with IOMMU lock held */ static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, hwaddr addr) { @@ -235,6 +254,7 @@ out: return entry; } +/* Must be with IOMMU lock held */ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, uint16_t domain_id, hwaddr addr, uint64_t slpte, uint8_t access_flags, uint32_t level) @@ -246,7 +266,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { trace_vtd_iotlb_reset("iotlb exceeds size limit"); - vtd_reset_iotlb(s); + vtd_reset_iotlb_locked(s); } entry->gfn = gfn; @@ -1106,7 +1126,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, IntelIOMMUState *s = vtd_as->iommu_state; VTDContextEntry ce; uint8_t bus_num = pci_bus_num(bus); - VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry; + VTDContextCacheEntry *cc_entry; uint64_t slpte, page_mask; uint32_t level; uint16_t source_id = vtd_make_source_id(bus_num, devfn); @@ -1123,6 +1143,10 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, */ assert(!vtd_is_interrupt_addr(addr)); + vtd_iommu_lock(s); + + cc_entry = &vtd_as->context_cache_entry; + /* Try to fetch slpte form IOTLB */ iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); if (iotlb_entry) { @@ -1182,7 +1206,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, * IOMMU region can be swapped back. */ vtd_pt_enable_fast_path(s, source_id); - + vtd_iommu_unlock(s); return true; } @@ -1203,6 +1227,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte, access_flags, level); out: + vtd_iommu_unlock(s); entry->iova = addr & page_mask; entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; entry->addr_mask = ~page_mask; @@ -1210,6 +1235,7 @@ out: return true; error: + vtd_iommu_unlock(s); entry->iova = 0; entry->translated_addr = 0; entry->addr_mask = 0; @@ -1258,10 +1284,13 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s) static void vtd_context_global_invalidate(IntelIOMMUState *s) { trace_vtd_inv_desc_cc_global(); + /* Protects context cache */ + vtd_iommu_lock(s); s->context_cache_gen++; if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { - vtd_reset_context_cache(s); + vtd_reset_context_cache_locked(s); } + vtd_iommu_unlock(s); vtd_switch_address_space_all(s); /* * From VT-d spec 6.5.2.1, a global context entry invalidation @@ -1313,7 +1342,9 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), VTD_PCI_FUNC(devfn_it)); + vtd_iommu_lock(s); vtd_as->context_cache_entry.context_cache_gen = 0; + vtd_iommu_unlock(s); /* * Do switch address space when needed, in case if the * device passthrough bit is switched. @@ -1377,8 +1408,10 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) trace_vtd_inv_desc_iotlb_domain(domain_id); + vtd_iommu_lock(s); g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, &domain_id); + vtd_iommu_unlock(s); QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), @@ -1426,7 +1459,9 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, info.domain_id = domain_id; info.addr = addr; info.mask = ~((1 << am) - 1); + vtd_iommu_lock(s); g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); + vtd_iommu_unlock(s); vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); } @@ -2929,8 +2964,10 @@ static void vtd_init(IntelIOMMUState *s) s->cap |= VTD_CAP_CM; } - vtd_reset_context_cache(s); - vtd_reset_iotlb(s); + vtd_iommu_lock(s); + vtd_reset_context_cache_locked(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); /* Define registers with default values and bit semantics */ vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); @@ -3070,6 +3107,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) } QLIST_INIT(&s->vtd_as_with_notifiers); + qemu_mutex_init(&s->iommu_lock); memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, "intel_iommu", DMAR_REG_SIZE); diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 032e33bcb2..016e74bedb 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -300,6 +300,12 @@ struct IntelIOMMUState { OnOffAuto intr_eim; /* Toggle for EIM cabability */ bool buggy_eim; /* Force buggy EIM unless eim=off */ uint8_t aw_bits; /* Host/IOVA address width (in bits) */ + + /* + * Protects IOMMU states in general. Currently it protects the + * per-IOMMU IOTLB cache, and context entry cache in VTDAddressSpace. + */ + QemuMutex iommu_lock; }; /* Find the VTD Address space associated with the given bus pointer, -- cgit v1.2.3-55-g7522 From 4f8a62a933a79094e44bc1b16b63bb23e62d67b4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:12 +0800 Subject: intel-iommu: only do page walk for MAP notifiers For UNMAP-only IOMMU notifiers, we don't need to walk the page tables. Fasten that procedure by skipping the page table walk. That should boost performance for UNMAP-only notifiers like vhost. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 44 ++++++++++++++++++++++++++++++++++++++----- include/hw/i386/intel_iommu.h | 2 ++ 2 files changed, 41 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 8d4069de9f..38ccc741f9 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -138,6 +138,12 @@ static inline void vtd_iommu_unlock(IntelIOMMUState *s) qemu_mutex_unlock(&s->iommu_lock); } +/* Whether the address space needs to notify new mappings */ +static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) +{ + return as->notifier_flags & IOMMU_NOTIFIER_MAP; +} + /* GHashTable functions */ static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) { @@ -1436,14 +1442,36 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, VTDAddressSpace *vtd_as; VTDContextEntry ce; int ret; + hwaddr size = (1 << am) * VTD_PAGE_SIZE; QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { - vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE, - vtd_page_invalidate_notify_hook, - (void *)&vtd_as->iommu, true, s->aw_bits); + if (vtd_as_has_map_notifier(vtd_as)) { + /* + * As long as we have MAP notifications registered in + * any of our IOMMU notifiers, we need to sync the + * shadow page table. + */ + vtd_page_walk(&ce, addr, addr + size, + vtd_page_invalidate_notify_hook, + (void *)&vtd_as->iommu, true, s->aw_bits); + } else { + /* + * For UNMAP-only notifiers, we don't need to walk the + * page tables. We just deliver the PSI down to + * invalidate caches. + */ + IOMMUTLBEntry entry = { + .target_as = &address_space_memory, + .iova = addr, + .translated_addr = 0, + .addr_mask = size - 1, + .perm = IOMMU_NONE, + }; + memory_region_notify_iommu(&vtd_as->iommu, entry); + } } } } @@ -2383,6 +2411,9 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, exit(1); } + /* Update per-address-space notifier flags */ + vtd_as->notifier_flags = new; + if (old == IOMMU_NOTIFIER_NONE) { QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); } else if (new == IOMMU_NOTIFIER_NONE) { @@ -2891,8 +2922,11 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) PCI_FUNC(vtd_as->devfn), VTD_CONTEXT_ENTRY_DID(ce.hi), ce.hi, ce.lo); - vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false, - s->aw_bits); + if (vtd_as_has_map_notifier(vtd_as)) { + /* This is required only for MAP typed notifiers */ + vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false, + s->aw_bits); + } } else { trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), PCI_FUNC(vtd_as->devfn)); diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 016e74bedb..156f35e919 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -93,6 +93,8 @@ struct VTDAddressSpace { IntelIOMMUState *iommu_state; VTDContextCacheEntry context_cache_entry; QLIST_ENTRY(VTDAddressSpace) next; + /* Superset of notifier flags that this address space has */ + IOMMUNotifierFlag notifier_flags; }; struct VTDBus { -- cgit v1.2.3-55-g7522 From eecf5eedbdc0fc04f39abcf3afeedfbf21b25ca4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:16 +0800 Subject: util: implement simple iova tree Introduce a simplest iova tree implementation based on GTree. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 6 +++ include/qemu/iova-tree.h | 134 +++++++++++++++++++++++++++++++++++++++++++++++ util/Makefile.objs | 1 + util/iova-tree.c | 114 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 255 insertions(+) create mode 100644 include/qemu/iova-tree.h create mode 100644 util/iova-tree.c (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index e187b1f18f..f07fcee72e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1783,6 +1783,12 @@ F: include/sysemu/replay.h F: docs/replay.txt F: stubs/replay.c +IOVA Tree +M: Peter Xu +S: Maintained +F: include/qemu/iova-tree.h +F: util/iova-tree.c + Usermode Emulation ------------------ Overall diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h new file mode 100644 index 0000000000..b061932097 --- /dev/null +++ b/include/qemu/iova-tree.h @@ -0,0 +1,134 @@ +/* + * An very simplified iova tree implementation based on GTree. + * + * Copyright 2018 Red Hat, Inc. + * + * Authors: + * Peter Xu + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + */ +#ifndef IOVA_TREE_H +#define IOVA_TREE_H + +/* + * Currently the iova tree will only allow to keep ranges + * information, and no extra user data is allowed for each element. A + * benefit is that we can merge adjacent ranges internally within the + * tree. It can save a lot of memory when the ranges are splitted but + * mostly continuous. + * + * Note that current implementation does not provide any thread + * protections. Callers of the iova tree should be responsible + * for the thread safety issue. + */ + +#include "qemu/osdep.h" +#include "exec/memory.h" +#include "exec/hwaddr.h" + +#define IOVA_OK (0) +#define IOVA_ERR_INVALID (-1) /* Invalid parameters */ +#define IOVA_ERR_OVERLAP (-2) /* IOVA range overlapped */ + +typedef struct IOVATree IOVATree; +typedef struct DMAMap { + hwaddr iova; + hwaddr translated_addr; + hwaddr size; /* Inclusive */ + IOMMUAccessFlags perm; +} QEMU_PACKED DMAMap; +typedef gboolean (*iova_tree_iterator)(DMAMap *map); + +/** + * iova_tree_new: + * + * Create a new iova tree. + * + * Returns: the tree pointer when succeeded, or NULL if error. + */ +IOVATree *iova_tree_new(void); + +/** + * iova_tree_insert: + * + * @tree: the iova tree to insert + * @map: the mapping to insert + * + * Insert an iova range to the tree. If there is overlapped + * ranges, IOVA_ERR_OVERLAP will be returned. + * + * Return: 0 if succeeded, or <0 if error. + */ +int iova_tree_insert(IOVATree *tree, DMAMap *map); + +/** + * iova_tree_remove: + * + * @tree: the iova tree to remove range from + * @map: the map range to remove + * + * Remove mappings from the tree that are covered by the map range + * provided. The range does not need to be exactly what has inserted, + * all the mappings that are included in the provided range will be + * removed from the tree. Here map->translated_addr is meaningless. + * + * Return: 0 if succeeded, or <0 if error. + */ +int iova_tree_remove(IOVATree *tree, DMAMap *map); + +/** + * iova_tree_find: + * + * @tree: the iova tree to search from + * @map: the mapping to search + * + * Search for a mapping in the iova tree that overlaps with the + * mapping range specified. Only the first found mapping will be + * returned. + * + * Return: DMAMap pointer if found, or NULL if not found. Note that + * the returned DMAMap pointer is maintained internally. User should + * only read the content but never modify or free the content. Also, + * user is responsible to make sure the pointer is valid (say, no + * concurrent deletion in progress). + */ +DMAMap *iova_tree_find(IOVATree *tree, DMAMap *map); + +/** + * iova_tree_find_address: + * + * @tree: the iova tree to search from + * @iova: the iova address to find + * + * Similar to iova_tree_find(), but it tries to find mapping with + * range iova=iova & size=0. + * + * Return: same as iova_tree_find(). + */ +DMAMap *iova_tree_find_address(IOVATree *tree, hwaddr iova); + +/** + * iova_tree_foreach: + * + * @tree: the iova tree to iterate on + * @iterator: the interator for the mappings, return true to stop + * + * Iterate over the iova tree. + * + * Return: 1 if found any overlap, 0 if not, <0 if error. + */ +void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator); + +/** + * iova_tree_destroy: + * + * @tree: the iova tree to destroy + * + * Destroy an existing iova tree. + * + * Return: None. + */ +void iova_tree_destroy(IOVATree *tree); + +#endif diff --git a/util/Makefile.objs b/util/Makefile.objs index 728c3541db..e1c3fed4dc 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -47,4 +47,5 @@ util-obj-y += qht.o util-obj-y += range.o util-obj-y += stats64.o util-obj-y += systemd.o +util-obj-y += iova-tree.o util-obj-$(CONFIG_LINUX) += vfio-helpers.o diff --git a/util/iova-tree.c b/util/iova-tree.c new file mode 100644 index 0000000000..2d9cebfc89 --- /dev/null +++ b/util/iova-tree.c @@ -0,0 +1,114 @@ +/* + * IOVA tree implementation based on GTree. + * + * Copyright 2018 Red Hat, Inc. + * + * Authors: + * Peter Xu + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + */ + +#include +#include "qemu/iova-tree.h" + +struct IOVATree { + GTree *tree; +}; + +static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data) +{ + const DMAMap *m1 = a, *m2 = b; + + if (m1->iova > m2->iova + m2->size) { + return 1; + } + + if (m1->iova + m1->size < m2->iova) { + return -1; + } + + /* Overlapped */ + return 0; +} + +IOVATree *iova_tree_new(void) +{ + IOVATree *iova_tree = g_new0(IOVATree, 1); + + /* We don't have values actually, no need to free */ + iova_tree->tree = g_tree_new_full(iova_tree_compare, NULL, g_free, NULL); + + return iova_tree; +} + +DMAMap *iova_tree_find(IOVATree *tree, DMAMap *map) +{ + return g_tree_lookup(tree->tree, map); +} + +DMAMap *iova_tree_find_address(IOVATree *tree, hwaddr iova) +{ + DMAMap map = { .iova = iova, .size = 0 }; + + return iova_tree_find(tree, &map); +} + +static inline void iova_tree_insert_internal(GTree *gtree, DMAMap *range) +{ + /* Key and value are sharing the same range data */ + g_tree_insert(gtree, range, range); +} + +int iova_tree_insert(IOVATree *tree, DMAMap *map) +{ + DMAMap *new; + + if (map->iova + map->size < map->iova || map->perm == IOMMU_NONE) { + return IOVA_ERR_INVALID; + } + + /* We don't allow to insert range that overlaps with existings */ + if (iova_tree_find(tree, map)) { + return IOVA_ERR_OVERLAP; + } + + new = g_new0(DMAMap, 1); + memcpy(new, map, sizeof(*new)); + iova_tree_insert_internal(tree->tree, new); + + return IOVA_OK; +} + +static gboolean iova_tree_traverse(gpointer key, gpointer value, + gpointer data) +{ + iova_tree_iterator iterator = data; + DMAMap *map = key; + + g_assert(key == value); + + return iterator(map); +} + +void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator) +{ + g_tree_foreach(tree->tree, iova_tree_traverse, iterator); +} + +int iova_tree_remove(IOVATree *tree, DMAMap *map) +{ + DMAMap *overlap; + + while ((overlap = iova_tree_find(tree, map))) { + g_tree_remove(tree->tree, overlap); + } + + return IOVA_OK; +} + +void iova_tree_destroy(IOVATree *tree) +{ + g_tree_destroy(tree->tree); + g_free(tree); +} -- cgit v1.2.3-55-g7522 From 63b88968f139b6a77f2f81e6f1eedf70c0170a85 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:17 +0800 Subject: intel-iommu: rework the page walk logic This patch fixes a potential small window that the DMA page table might be incomplete or invalid when the guest sends domain/context invalidations to a device. This can cause random DMA errors for assigned devices. This is a major change to the VT-d shadow page walking logic. It includes but is not limited to: - For each VTDAddressSpace, now we maintain what IOVA ranges we have mapped and what we have not. With that information, now we only send MAP or UNMAP when necessary. Say, we don't send MAP notifies if we know we have already mapped the range, meanwhile we don't send UNMAP notifies if we know we never mapped the range at all. - Introduce vtd_sync_shadow_page_table[_range] APIs so that we can call in any places to resync the shadow page table for a device. - When we receive domain/context invalidation, we should not really run the replay logic, instead we use the new sync shadow page table API to resync the whole shadow page table without unmapping the whole region. After this change, we'll only do the page walk once for each domain invalidations (before this, it can be multiple, depending on number of notifiers per address space). While at it, the page walking logic is also refactored to be simpler. CC: QEMU Stable Reported-by: Jintack Lim Tested-by: Jintack Lim Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 213 ++++++++++++++++++++++++++++++------------ hw/i386/trace-events | 3 +- include/hw/i386/intel_iommu.h | 2 + 3 files changed, 159 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 61bb3d31e7..b5a09b7908 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -769,10 +769,77 @@ typedef struct { static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) { + VTDAddressSpace *as = info->as; vtd_page_walk_hook hook_fn = info->hook_fn; void *private = info->private; + DMAMap target = { + .iova = entry->iova, + .size = entry->addr_mask, + .translated_addr = entry->translated_addr, + .perm = entry->perm, + }; + DMAMap *mapped = iova_tree_find(as->iova_tree, &target); + + if (entry->perm == IOMMU_NONE && !info->notify_unmap) { + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } assert(hook_fn); + + /* Update local IOVA mapped ranges */ + if (entry->perm) { + if (mapped) { + /* If it's exactly the same translation, skip */ + if (!memcmp(mapped, &target, sizeof(target))) { + trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask, + entry->translated_addr); + return 0; + } else { + /* + * Translation changed. Normally this should not + * happen, but it can happen when with buggy guest + * OSes. Note that there will be a small window that + * we don't have map at all. But that's the best + * effort we can do. The ideal way to emulate this is + * atomically modify the PTE to follow what has + * changed, but we can't. One example is that vfio + * driver only has VFIO_IOMMU_[UN]MAP_DMA but no + * interface to modify a mapping (meanwhile it seems + * meaningless to even provide one). Anyway, let's + * mark this as a TODO in case one day we'll have + * a better solution. + */ + IOMMUAccessFlags cache_perm = entry->perm; + int ret; + + /* Emulate an UNMAP */ + entry->perm = IOMMU_NONE; + trace_vtd_page_walk_one(info->domain_id, + entry->iova, + entry->translated_addr, + entry->addr_mask, + entry->perm); + ret = hook_fn(entry, private); + if (ret) { + return ret; + } + /* Drop any existing mapping */ + iova_tree_remove(as->iova_tree, &target); + /* Recover the correct permission */ + entry->perm = cache_perm; + } + } + iova_tree_insert(as->iova_tree, &target); + } else { + if (!mapped) { + /* Skip since we didn't map this range at all */ + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + iova_tree_remove(as->iova_tree, &target); + } + trace_vtd_page_walk_one(info->domain_id, entry->iova, entry->translated_addr, entry->addr_mask, entry->perm); @@ -834,45 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, */ entry_valid = read_cur | write_cur; - entry.target_as = &address_space_memory; - entry.iova = iova & subpage_mask; - entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); - entry.addr_mask = ~subpage_mask; - - if (vtd_is_last_slpte(slpte, level)) { - /* NOTE: this is only meaningful if entry_valid == true */ - entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); - if (!entry_valid && !info->notify_unmap) { - trace_vtd_page_walk_skip_perm(iova, iova_next); - goto next; - } - ret = vtd_page_walk_one(&entry, info); - if (ret < 0) { - return ret; - } - } else { - if (!entry_valid) { - if (info->notify_unmap) { - /* - * The whole entry is invalid; unmap it all. - * Translated address is meaningless, zero it. - */ - entry.translated_addr = 0x0; - ret = vtd_page_walk_one(&entry, info); - if (ret < 0) { - return ret; - } - } else { - trace_vtd_page_walk_skip_perm(iova, iova_next); - } - goto next; - } + if (!vtd_is_last_slpte(slpte, level) && entry_valid) { + /* + * This is a valid PDE (or even bigger than PDE). We need + * to walk one further level. + */ ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), iova, MIN(iova_next, end), level - 1, read_cur, write_cur, info); - if (ret < 0) { - return ret; - } + } else { + /* + * This means we are either: + * + * (1) the real page entry (either 4K page, or huge page) + * (2) the whole range is invalid + * + * In either case, we send an IOTLB notification down. + */ + entry.target_as = &address_space_memory; + entry.iova = iova & subpage_mask; + entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); + entry.addr_mask = ~subpage_mask; + /* NOTE: this is only meaningful if entry_valid == true */ + entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); + ret = vtd_page_walk_one(&entry, info); + } + + if (ret < 0) { + return ret; } next: @@ -964,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, return 0; } +static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, + void *private) +{ + memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); + return 0; +} + +/* If context entry is NULL, we'll try to fetch it on our own. */ +static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, + VTDContextEntry *ce, + hwaddr addr, hwaddr size) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + vtd_page_walk_info info = { + .hook_fn = vtd_sync_shadow_page_hook, + .private = (void *)&vtd_as->iommu, + .notify_unmap = true, + .aw = s->aw_bits, + .as = vtd_as, + }; + VTDContextEntry ce_cache; + int ret; + + if (ce) { + /* If the caller provided context entry, use it */ + ce_cache = *ce; + } else { + /* If the caller didn't provide ce, try to fetch */ + ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce_cache); + if (ret) { + /* + * This should not really happen, but in case it happens, + * we just skip the sync for this time. After all we even + * don't have the root table pointer! + */ + trace_vtd_err("Detected invalid context entry when " + "trying to sync shadow page table"); + return 0; + } + } + + info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi); + + return vtd_page_walk(&ce_cache, addr, addr + size, &info); +} + +static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) +{ + return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX); +} + /* * Fetch translation type for specific device. Returns <0 if error * happens, otherwise return the shifted type to check against @@ -1296,7 +1404,7 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s) VTDAddressSpace *vtd_as; QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } @@ -1371,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, vtd_switch_address_space(vtd_as); /* * So a device is moving out of (or moving into) a - * domain, a replay() suites here to notify all the - * IOMMU_NOTIFIER_MAP registers about this change. + * domain, resync the shadow page table. * This won't bring bad even if we have no such * notifier registered - the IOMMU notification * framework will skip MAP notifications if that * happened. */ - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } @@ -1436,18 +1543,11 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce) && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } -static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry, - void *private) -{ - memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); - return 0; -} - static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, uint16_t domain_id, hwaddr addr, uint8_t am) @@ -1462,21 +1562,12 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { if (vtd_as_has_map_notifier(vtd_as)) { - vtd_page_walk_info info = { - .hook_fn = vtd_page_invalidate_notify_hook, - .private = (void *)&vtd_as->iommu, - .notify_unmap = true, - .aw = s->aw_bits, - .as = vtd_as, - .domain_id = domain_id, - }; - /* * As long as we have MAP notifications registered in * any of our IOMMU notifiers, we need to sync the * shadow page table. */ - vtd_page_walk(&ce, addr, addr + size, &info); + vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); } else { /* * For UNMAP-only notifiers, we don't need to walk the @@ -2806,6 +2897,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) vtd_dev_as->devfn = (uint8_t)devfn; vtd_dev_as->iommu_state = s; vtd_dev_as->context_cache_entry.context_cache_gen = 0; + vtd_dev_as->iova_tree = iova_tree_new(); /* * Memory region relationships looks like (Address range shows @@ -2858,6 +2950,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) hwaddr start = n->start; hwaddr end = n->end; IntelIOMMUState *s = as->iommu_state; + DMAMap map; /* * Note: all the codes in this function has a assumption that IOVA @@ -2902,6 +2995,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) VTD_PCI_FUNC(as->devfn), entry.iova, size); + map.iova = entry.iova; + map.size = entry.addr_mask; + iova_tree_remove(as->iova_tree, &map); + memory_region_notify_one(n, &entry); } diff --git a/hw/i386/trace-events b/hw/i386/trace-events index ca23ba9fad..e14d06ec83 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -40,8 +40,9 @@ vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint6 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64 vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64 +vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" -vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty" vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 156f35e919..fbfedcb1c0 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -27,6 +27,7 @@ #include "hw/i386/ioapic.h" #include "hw/pci/msi.h" #include "hw/sysbus.h" +#include "qemu/iova-tree.h" #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu" #define INTEL_IOMMU_DEVICE(obj) \ @@ -95,6 +96,7 @@ struct VTDAddressSpace { QLIST_ENTRY(VTDAddressSpace) next; /* Superset of notifier flags that this address space has */ IOMMUNotifierFlag notifier_flags; + IOVATree *iova_tree; /* Traces mapped IOVA ranges */ }; struct VTDBus { -- cgit v1.2.3-55-g7522