summaryrefslogtreecommitdiffstats
path: root/virt
diff options
context:
space:
mode:
Diffstat (limited to 'virt')
-rw-r--r--virt/kvm/Kconfig3
-rw-r--r--virt/kvm/assigned-dev.c820
-rw-r--r--virt/kvm/coalesced_mmio.c43
-rw-r--r--virt/kvm/coalesced_mmio.h15
-rw-r--r--virt/kvm/eventfd.c41
-rw-r--r--virt/kvm/ioapic.c118
-rw-r--r--virt/kvm/ioapic.h7
-rw-r--r--virt/kvm/iommu.c36
-rw-r--r--virt/kvm/irq_comm.c235
-rw-r--r--virt/kvm/kvm_main.c1374
10 files changed, 1584 insertions, 1108 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index daece36c0a57..7f1178f6b839 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -12,3 +12,6 @@ config HAVE_KVM_EVENTFD
config KVM_APIC_ARCHITECTURE
bool
+
+config KVM_MMIO
+ bool
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
new file mode 100644
index 000000000000..057e2cca6af5
--- /dev/null
+++ b/virt/kvm/assigned-dev.c
@@ -0,0 +1,820 @@
+/*
+ * Kernel-based Virtual Machine - device assignment support
+ *
+ * Copyright (C) 2006-9 Red Hat, Inc
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include "irq.h"
+
+static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
+ int assigned_dev_id)
+{
+ struct list_head *ptr;
+ struct kvm_assigned_dev_kernel *match;
+
+ list_for_each(ptr, head) {
+ match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+ if (match->assigned_dev_id == assigned_dev_id)
+ return match;
+ }
+ return NULL;
+}
+
+static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
+ *assigned_dev, int irq)
+{
+ int i, index;
+ struct msix_entry *host_msix_entries;
+
+ host_msix_entries = assigned_dev->host_msix_entries;
+
+ index = -1;
+ for (i = 0; i < assigned_dev->entries_nr; i++)
+ if (irq == host_msix_entries[i].vector) {
+ index = i;
+ break;
+ }
+ if (index < 0) {
+ printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
+ return 0;
+ }
+
+ return index;
+}
+
+static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev;
+ struct kvm *kvm;
+ int i;
+
+ assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
+ interrupt_work);
+ kvm = assigned_dev->kvm;
+
+ spin_lock_irq(&assigned_dev->assigned_dev_lock);
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+ struct kvm_guest_msix_entry *guest_entries =
+ assigned_dev->guest_msix_entries;
+ for (i = 0; i < assigned_dev->entries_nr; i++) {
+ if (!(guest_entries[i].flags &
+ KVM_ASSIGNED_MSIX_PENDING))
+ continue;
+ guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
+ kvm_set_irq(assigned_dev->kvm,
+ assigned_dev->irq_source_id,
+ guest_entries[i].vector, 1);
+ }
+ } else
+ kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+ assigned_dev->guest_irq, 1);
+
+ spin_unlock_irq(&assigned_dev->assigned_dev_lock);
+}
+
+static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
+{
+ unsigned long flags;
+ struct kvm_assigned_dev_kernel *assigned_dev =
+ (struct kvm_assigned_dev_kernel *) dev_id;
+
+ spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+ int index = find_index_from_host_irq(assigned_dev, irq);
+ if (index < 0)
+ goto out;
+ assigned_dev->guest_msix_entries[index].flags |=
+ KVM_ASSIGNED_MSIX_PENDING;
+ }
+
+ schedule_work(&assigned_dev->interrupt_work);
+
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
+ disable_irq_nosync(irq);
+ assigned_dev->host_irq_disabled = true;
+ }
+
+out:
+ spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
+ return IRQ_HANDLED;
+}
+
+/* Ack the irq line for an assigned device */
+static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+ struct kvm_assigned_dev_kernel *dev;
+ unsigned long flags;
+
+ if (kian->gsi == -1)
+ return;
+
+ dev = container_of(kian, struct kvm_assigned_dev_kernel,
+ ack_notifier);
+
+ kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
+
+ /* The guest irq may be shared so this ack may be
+ * from another device.
+ */
+ spin_lock_irqsave(&dev->assigned_dev_lock, flags);
+ if (dev->host_irq_disabled) {
+ enable_irq(dev->host_irq);
+ dev->host_irq_disabled = false;
+ }
+ spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
+}
+
+static void deassign_guest_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
+ assigned_dev->ack_notifier.gsi = -1;
+
+ if (assigned_dev->irq_source_id != -1)
+ kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
+ assigned_dev->irq_source_id = -1;
+ assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
+}
+
+/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
+static void deassign_host_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ /*
+ * In kvm_free_device_irq, cancel_work_sync return true if:
+ * 1. work is scheduled, and then cancelled.
+ * 2. work callback is executed.
+ *
+ * The first one ensured that the irq is disabled and no more events
+ * would happen. But for the second one, the irq may be enabled (e.g.
+ * for MSI). So we disable irq here to prevent further events.
+ *
+ * Notice this maybe result in nested disable if the interrupt type is
+ * INTx, but it's OK for we are going to free it.
+ *
+ * If this function is a part of VM destroy, please ensure that till
+ * now, the kvm state is still legal for probably we also have to wait
+ * interrupt_work done.
+ */
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+ int i;
+ for (i = 0; i < assigned_dev->entries_nr; i++)
+ disable_irq_nosync(assigned_dev->
+ host_msix_entries[i].vector);
+
+ cancel_work_sync(&assigned_dev->interrupt_work);
+
+ for (i = 0; i < assigned_dev->entries_nr; i++)
+ free_irq(assigned_dev->host_msix_entries[i].vector,
+ (void *)assigned_dev);
+
+ assigned_dev->entries_nr = 0;
+ kfree(assigned_dev->host_msix_entries);
+ kfree(assigned_dev->guest_msix_entries);
+ pci_disable_msix(assigned_dev->dev);
+ } else {
+ /* Deal with MSI and INTx */
+ disable_irq_nosync(assigned_dev->host_irq);
+ cancel_work_sync(&assigned_dev->interrupt_work);
+
+ free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
+ pci_disable_msi(assigned_dev->dev);
+ }
+
+ assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
+}
+
+static int kvm_deassign_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev,
+ unsigned long irq_requested_type)
+{
+ unsigned long guest_irq_type, host_irq_type;
+
+ if (!irqchip_in_kernel(kvm))
+ return -EINVAL;
+ /* no irq assignment to deassign */
+ if (!assigned_dev->irq_requested_type)
+ return -ENXIO;
+
+ host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
+ guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
+
+ if (host_irq_type)
+ deassign_host_irq(kvm, assigned_dev);
+ if (guest_irq_type)
+ deassign_guest_irq(kvm, assigned_dev);
+
+ return 0;
+}
+
+static void kvm_free_assigned_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
+}
+
+static void kvm_free_assigned_device(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel
+ *assigned_dev)
+{
+ kvm_free_assigned_irq(kvm, assigned_dev);
+
+ pci_reset_function(assigned_dev->dev);
+
+ pci_release_regions(assigned_dev->dev);
+ pci_disable_device(assigned_dev->dev);
+ pci_dev_put(assigned_dev->dev);
+
+ list_del(&assigned_dev->list);
+ kfree(assigned_dev);
+}
+
+void kvm_free_all_assigned_devices(struct kvm *kvm)
+{
+ struct list_head *ptr, *ptr2;
+ struct kvm_assigned_dev_kernel *assigned_dev;
+
+ list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
+ assigned_dev = list_entry(ptr,
+ struct kvm_assigned_dev_kernel,
+ list);
+
+ kvm_free_assigned_device(kvm, assigned_dev);
+ }
+}
+
+static int assigned_device_enable_host_intx(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev)
+{
+ dev->host_irq = dev->dev->irq;
+ /* Even though this is PCI, we don't want to use shared
+ * interrupts. Sharing host devices with guest-assigned devices
+ * on the same interrupt line is not a happy situation: there
+ * are going to be long delays in accepting, acking, etc.
+ */
+ if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
+ 0, "kvm_assigned_intx_device", (void *)dev))
+ return -EIO;
+ return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_host_msi(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev)
+{
+ int r;
+
+ if (!dev->dev->msi_enabled) {
+ r = pci_enable_msi(dev->dev);
+ if (r)
+ return r;
+ }
+
+ dev->host_irq = dev->dev->irq;
+ if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
+ "kvm_assigned_msi_device", (void *)dev)) {
+ pci_disable_msi(dev->dev);
+ return -EIO;
+ }
+
+ return 0;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_host_msix(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev)
+{
+ int i, r = -EINVAL;
+
+ /* host_msix_entries and guest_msix_entries should have been
+ * initialized */
+ if (dev->entries_nr == 0)
+ return r;
+
+ r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
+ if (r)
+ return r;
+
+ for (i = 0; i < dev->entries_nr; i++) {
+ r = request_irq(dev->host_msix_entries[i].vector,
+ kvm_assigned_dev_intr, 0,
+ "kvm_assigned_msix_device",
+ (void *)dev);
+ /* FIXME: free requested_irq's on failure */
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+#endif
+
+static int assigned_device_enable_guest_intx(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq)
+{
+ dev->guest_irq = irq->guest_irq;
+ dev->ack_notifier.gsi = irq->guest_irq;
+ return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_guest_msi(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq)
+{
+ dev->guest_irq = irq->guest_irq;
+ dev->ack_notifier.gsi = -1;
+ dev->host_irq_disabled = false;
+ return 0;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_guest_msix(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq)
+{
+ dev->guest_irq = irq->guest_irq;
+ dev->ack_notifier.gsi = -1;
+ dev->host_irq_disabled = false;
+ return 0;
+}
+#endif
+
+static int assign_host_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ __u32 host_irq_type)
+{
+ int r = -EEXIST;
+
+ if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
+ return r;
+
+ switch (host_irq_type) {
+ case KVM_DEV_IRQ_HOST_INTX:
+ r = assigned_device_enable_host_intx(kvm, dev);
+ break;
+#ifdef __KVM_HAVE_MSI
+ case KVM_DEV_IRQ_HOST_MSI:
+ r = assigned_device_enable_host_msi(kvm, dev);
+ break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+ case KVM_DEV_IRQ_HOST_MSIX:
+ r = assigned_device_enable_host_msix(kvm, dev);
+ break;
+#endif
+ default:
+ r = -EINVAL;
+ }
+
+ if (!r)
+ dev->irq_requested_type |= host_irq_type;
+
+ return r;
+}
+
+static int assign_guest_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq,
+ unsigned long guest_irq_type)
+{
+ int id;
+ int r = -EEXIST;
+
+ if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
+ return r;
+
+ id = kvm_request_irq_source_id(kvm);
+ if (id < 0)
+ return id;
+
+ dev->irq_source_id = id;
+
+ switch (guest_irq_type) {
+ case KVM_DEV_IRQ_GUEST_INTX:
+ r = assigned_device_enable_guest_intx(kvm, dev, irq);
+ break;
+#ifdef __KVM_HAVE_MSI
+ case KVM_DEV_IRQ_GUEST_MSI:
+ r = assigned_device_enable_guest_msi(kvm, dev, irq);
+ break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+ case KVM_DEV_IRQ_GUEST_MSIX:
+ r = assigned_device_enable_guest_msix(kvm, dev, irq);
+ break;
+#endif
+ default:
+ r = -EINVAL;
+ }
+
+ if (!r) {
+ dev->irq_requested_type |= guest_irq_type;
+ kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
+ } else
+ kvm_free_irq_source_id(kvm, dev->irq_source_id);
+
+ return r;
+}
+
+/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
+static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
+ struct kvm_assigned_irq *assigned_irq)
+{
+ int r = -EINVAL;
+ struct kvm_assigned_dev_kernel *match;
+ unsigned long host_irq_type, guest_irq_type;
+
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ if (!irqchip_in_kernel(kvm))
+ return r;
+
+ mutex_lock(&kvm->lock);
+ r = -ENODEV;
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_irq->assigned_dev_id);
+ if (!match)
+ goto out;
+
+ host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
+ guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
+
+ r = -EINVAL;
+ /* can only assign one type at a time */
+ if (hweight_long(host_irq_type) > 1)
+ goto out;
+ if (hweight_long(guest_irq_type) > 1)
+ goto out;
+ if (host_irq_type == 0 && guest_irq_type == 0)
+ goto out;
+
+ r = 0;
+ if (host_irq_type)
+ r = assign_host_irq(kvm, match, host_irq_type);
+ if (r)
+ goto out;
+
+ if (guest_irq_type)
+ r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
+ struct kvm_assigned_irq
+ *assigned_irq)
+{
+ int r = -ENODEV;
+ struct kvm_assigned_dev_kernel *match;
+
+ mutex_lock(&kvm->lock);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_irq->assigned_dev_id);
+ if (!match)
+ goto out;
+
+ r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+ int r = 0, idx;
+ struct kvm_assigned_dev_kernel *match;
+ struct pci_dev *dev;
+
+ mutex_lock(&kvm->lock);
+ idx = srcu_read_lock(&kvm->srcu);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_dev->assigned_dev_id);
+ if (match) {
+ /* device already assigned */
+ r = -EEXIST;
+ goto out;
+ }
+
+ match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
+ if (match == NULL) {
+ printk(KERN_INFO "%s: Couldn't allocate memory\n",
+ __func__);
+ r = -ENOMEM;
+ goto out;
+ }
+ dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
+ assigned_dev->busnr,
+ assigned_dev->devfn);
+ if (!dev) {
+ printk(KERN_INFO "%s: host device not found\n", __func__);
+ r = -EINVAL;
+ goto out_free;
+ }
+ if (pci_enable_device(dev)) {
+ printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
+ r = -EBUSY;
+ goto out_put;
+ }
+ r = pci_request_regions(dev, "kvm_assigned_device");
+ if (r) {
+ printk(KERN_INFO "%s: Could not get access to device regions\n",
+ __func__);
+ goto out_disable;
+ }
+
+ pci_reset_function(dev);
+
+ match->assigned_dev_id = assigned_dev->assigned_dev_id;
+ match->host_segnr = assigned_dev->segnr;
+ match->host_busnr = assigned_dev->busnr;
+ match->host_devfn = assigned_dev->devfn;
+ match->flags = assigned_dev->flags;
+ match->dev = dev;
+ spin_lock_init(&match->assigned_dev_lock);
+ match->irq_source_id = -1;
+ match->kvm = kvm;
+ match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+ INIT_WORK(&match->interrupt_work,
+ kvm_assigned_dev_interrupt_work_handler);
+
+ list_add(&match->list, &kvm->arch.assigned_dev_head);
+
+ if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
+ if (!kvm->arch.iommu_domain) {
+ r = kvm_iommu_map_guest(kvm);
+ if (r)
+ goto out_list_del;
+ }
+ r = kvm_assign_device(kvm, match);
+ if (r)
+ goto out_list_del;
+ }
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+ mutex_unlock(&kvm->lock);
+ return r;
+out_list_del:
+ list_del(&match->list);
+ pci_release_regions(dev);
+out_disable:
+ pci_disable_device(dev);
+out_put:
+ pci_dev_put(dev);
+out_free:
+ kfree(match);
+ srcu_read_unlock(&kvm->srcu, idx);
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+ int r = 0;
+ struct kvm_assigned_dev_kernel *match;
+
+ mutex_lock(&kvm->lock);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_dev->assigned_dev_id);
+ if (!match) {
+ printk(KERN_INFO "%s: device hasn't been assigned before, "
+ "so cannot be deassigned\n", __func__);
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
+ kvm_deassign_device(kvm, match);
+
+ kvm_free_assigned_device(kvm, match);
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+
+#ifdef __KVM_HAVE_MSIX
+static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
+ struct kvm_assigned_msix_nr *entry_nr)
+{
+ int r = 0;
+ struct kvm_assigned_dev_kernel *adev;
+
+ mutex_lock(&kvm->lock);
+
+ adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ entry_nr->assigned_dev_id);
+ if (!adev) {
+ r = -EINVAL;
+ goto msix_nr_out;
+ }
+
+ if (adev->entries_nr == 0) {
+ adev->entries_nr = entry_nr->entry_nr;
+ if (adev->entries_nr == 0 ||
+ adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
+ r = -EINVAL;
+ goto msix_nr_out;
+ }
+
+ adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
+ entry_nr->entry_nr,
+ GFP_KERNEL);
+ if (!adev->host_msix_entries) {
+ r = -ENOMEM;
+ goto msix_nr_out;
+ }
+ adev->guest_msix_entries = kzalloc(
+ sizeof(struct kvm_guest_msix_entry) *
+ entry_nr->entry_nr, GFP_KERNEL);
+ if (!adev->guest_msix_entries) {
+ kfree(adev->host_msix_entries);
+ r = -ENOMEM;
+ goto msix_nr_out;
+ }
+ } else /* Not allowed set MSI-X number twice */
+ r = -EINVAL;
+msix_nr_out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
+ struct kvm_assigned_msix_entry *entry)
+{
+ int r = 0, i;
+ struct kvm_assigned_dev_kernel *adev;
+
+ mutex_lock(&kvm->lock);
+
+ adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ entry->assigned_dev_id);
+
+ if (!adev) {
+ r = -EINVAL;
+ goto msix_entry_out;
+ }
+
+ for (i = 0; i < adev->entries_nr; i++)
+ if (adev->guest_msix_entries[i].vector == 0 ||
+ adev->guest_msix_entries[i].entry == entry->entry) {
+ adev->guest_msix_entries[i].entry = entry->entry;
+ adev->guest_msix_entries[i].vector = entry->gsi;
+ adev->host_msix_entries[i].entry = entry->entry;
+ break;
+ }
+ if (i == adev->entries_nr) {
+ r = -ENOSPC;
+ goto msix_entry_out;
+ }
+
+msix_entry_out:
+ mutex_unlock(&kvm->lock);
+
+ return r;
+}
+#endif
+
+long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+ unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ int r = -ENOTTY;
+
+ switch (ioctl) {
+ case KVM_ASSIGN_PCI_DEVICE: {
+ struct kvm_assigned_pci_dev assigned_dev;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+ goto out;
+ r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_ASSIGN_IRQ: {
+ r = -EOPNOTSUPP;
+ break;
+ }
+#ifdef KVM_CAP_ASSIGN_DEV_IRQ
+ case KVM_ASSIGN_DEV_IRQ: {
+ struct kvm_assigned_irq assigned_irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+ goto out;
+ r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_DEASSIGN_DEV_IRQ: {
+ struct kvm_assigned_irq assigned_irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+ goto out;
+ r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
+ if (r)
+ goto out;
+ break;
+ }
+#endif
+#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
+ case KVM_DEASSIGN_PCI_DEVICE: {
+ struct kvm_assigned_pci_dev assigned_dev;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+ goto out;
+ r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
+ if (r)
+ goto out;
+ break;
+ }
+#endif
+#ifdef KVM_CAP_IRQ_ROUTING
+ case KVM_SET_GSI_ROUTING: {
+ struct kvm_irq_routing routing;
+ struct kvm_irq_routing __user *urouting;
+ struct kvm_irq_routing_entry *entries;
+
+ r = -EFAULT;
+ if (copy_from_user(&routing, argp, sizeof(routing)))
+ goto out;
+ r = -EINVAL;
+ if (routing.nr >= KVM_MAX_IRQ_ROUTES)
+ goto out;
+ if (routing.flags)
+ goto out;
+ r = -ENOMEM;
+ entries = vmalloc(routing.nr * sizeof(*entries));
+ if (!entries)
+ goto out;
+ r = -EFAULT;
+ urouting = argp;
+ if (copy_from_user(entries, urouting->entries,
+ routing.nr * sizeof(*entries)))
+ goto out_free_irq_routing;
+ r = kvm_set_irq_routing(kvm, entries, routing.nr,
+ routing.flags);
+ out_free_irq_routing:
+ vfree(entries);
+ break;
+ }
+#endif /* KVM_CAP_IRQ_ROUTING */
+#ifdef __KVM_HAVE_MSIX
+ case KVM_ASSIGN_SET_MSIX_NR: {
+ struct kvm_assigned_msix_nr entry_nr;
+ r = -EFAULT;
+ if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
+ goto out;
+ r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_ASSIGN_SET_MSIX_ENTRY: {
+ struct kvm_assigned_msix_entry entry;
+ r = -EFAULT;
+ if (copy_from_user(&entry, argp, sizeof entry))
+ goto out;
+ r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
+ if (r)
+ goto out;
+ break;
+ }
+#endif
+ }
+out:
+ return r;
+}
+
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 04d69cd7049b..5169736377a3 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -92,41 +92,64 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = {
int kvm_coalesced_mmio_init(struct kvm *kvm)
{
struct kvm_coalesced_mmio_dev *dev;
+ struct page *page;
int ret;
+ ret = -ENOMEM;
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto out_err;
+ kvm->coalesced_mmio_ring = page_address(page);
+
+ ret = -ENOMEM;
dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
if (!dev)
- return -ENOMEM;
+ goto out_free_page;
spin_lock_init(&dev->lock);
kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
dev->kvm = kvm;
kvm->coalesced_mmio_dev = dev;
- ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev);
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+ mutex_unlock(&kvm->slots_lock);
if (ret < 0)
- kfree(dev);
+ goto out_free_dev;
+
+ return ret;
+out_free_dev:
+ kfree(dev);
+out_free_page:
+ __free_page(page);
+out_err:
return ret;
}
+void kvm_coalesced_mmio_free(struct kvm *kvm)
+{
+ if (kvm->coalesced_mmio_ring)
+ free_page((unsigned long)kvm->coalesced_mmio_ring);
+}
+
int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
- struct kvm_coalesced_mmio_zone *zone)
+ struct kvm_coalesced_mmio_zone *zone)
{
struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
if (dev == NULL)
return -EINVAL;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return -ENOBUFS;
}
dev->zone[dev->nb_zones] = *zone;
dev->nb_zones++;
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return 0;
}
@@ -140,10 +163,10 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
if (dev == NULL)
return -EINVAL;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
i = dev->nb_zones;
- while(i) {
+ while (i) {
z = &dev->zone[i - 1];
/* unregister all zones
@@ -158,7 +181,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
i--;
}
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return 0;
}
diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h
index 4b49f27fa31e..8a5959e3535f 100644
--- a/virt/kvm/coalesced_mmio.h
+++ b/virt/kvm/coalesced_mmio.h
@@ -1,3 +1,6 @@
+#ifndef __KVM_COALESCED_MMIO_H__
+#define __KVM_COALESCED_MMIO_H__
+
/*
* KVM coalesced MMIO
*
@@ -7,6 +10,8 @@
*
*/
+#ifdef CONFIG_KVM_MMIO
+
#define KVM_COALESCED_MMIO_ZONE_MAX 100
struct kvm_coalesced_mmio_dev {
@@ -18,7 +23,17 @@ struct kvm_coalesced_mmio_dev {
};
int kvm_coalesced_mmio_init(struct kvm *kvm);
+void kvm_coalesced_mmio_free(struct kvm *kvm);
int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
struct kvm_coalesced_mmio_zone *zone);
int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
struct kvm_coalesced_mmio_zone *zone);
+
+#else
+
+static inline int kvm_coalesced_mmio_init(struct kvm *kvm) { return 0; }
+static inline void kvm_coalesced_mmio_free(struct kvm *kvm) { }
+
+#endif
+
+#endif
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index bb4ebd89b9ff..7016319b1ec0 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -47,7 +47,6 @@ struct _irqfd {
int gsi;
struct list_head list;
poll_table pt;
- wait_queue_head_t *wqh;
wait_queue_t wait;
struct work_struct inject;
struct work_struct shutdown;
@@ -61,10 +60,8 @@ irqfd_inject(struct work_struct *work)
struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
struct kvm *kvm = irqfd->kvm;
- mutex_lock(&kvm->irq_lock);
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
- mutex_unlock(&kvm->irq_lock);
}
/*
@@ -74,12 +71,13 @@ static void
irqfd_shutdown(struct work_struct *work)
{
struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
+ u64 cnt;
/*
* Synchronize with the wait-queue and unhook ourselves to prevent
* further events.
*/
- remove_wait_queue(irqfd->wqh, &irqfd->wait);
+ eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
/*
* We know no new events will be scheduled at this point, so block
@@ -160,15 +158,13 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
-
- irqfd->wqh = wqh;
add_wait_queue(wqh, &irqfd->wait);
}
static int
kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
{
- struct _irqfd *irqfd;
+ struct _irqfd *irqfd, *tmp;
struct file *file = NULL;
struct eventfd_ctx *eventfd = NULL;
int ret;
@@ -205,9 +201,20 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
+ spin_lock_irq(&kvm->irqfds.lock);
+
+ ret = 0;
+ list_for_each_entry(tmp, &kvm->irqfds.items, list) {
+ if (irqfd->eventfd != tmp->eventfd)
+ continue;
+ /* This fd is used for another irq already. */
+ ret = -EBUSY;
+ spin_unlock_irq(&kvm->irqfds.lock);
+ goto fail;
+ }
+
events = file->f_op->poll(file, &irqfd->pt);
- spin_lock_irq(&kvm->irqfds.lock);
list_add_tail(&irqfd->list, &kvm->irqfds.items);
spin_unlock_irq(&kvm->irqfds.lock);
@@ -453,7 +460,7 @@ static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
- struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
+ enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
struct _ioeventfd *p;
struct eventfd_ctx *eventfd;
int ret;
@@ -498,7 +505,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
else
p->wildcard = true;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
/* Verify that there isnt a match already */
if (ioeventfd_check_collision(kvm, p)) {
@@ -508,18 +515,18 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
kvm_iodevice_init(&p->dev, &ioeventfd_ops);
- ret = __kvm_io_bus_register_dev(bus, &p->dev);
+ ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev);
if (ret < 0)
goto unlock_fail;
list_add_tail(&p->list, &kvm->ioeventfds);
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return 0;
unlock_fail:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
fail:
kfree(p);
@@ -532,7 +539,7 @@ static int
kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
- struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
+ enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
struct _ioeventfd *p, *tmp;
struct eventfd_ctx *eventfd;
int ret = -ENOENT;
@@ -541,7 +548,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
@@ -555,13 +562,13 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
if (!p->wildcard && p->datamatch != args->datamatch)
continue;
- __kvm_io_bus_unregister_dev(bus, &p->dev);
+ kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
ioeventfd_release(p);
ret = 0;
break;
}
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
eventfd_ctx_put(eventfd);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 9fe140bb38ec..3db15a807f80 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -100,6 +100,19 @@ static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
return injected;
}
+static void update_handled_vectors(struct kvm_ioapic *ioapic)
+{
+ DECLARE_BITMAP(handled_vectors, 256);
+ int i;
+
+ memset(handled_vectors, 0, sizeof(handled_vectors));
+ for (i = 0; i < IOAPIC_NUM_PINS; ++i)
+ __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
+ memcpy(ioapic->handled_vectors, handled_vectors,
+ sizeof(handled_vectors));
+ smp_wmb();
+}
+
static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
{
unsigned index;
@@ -134,6 +147,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
e->bits |= (u32) val;
e->fields.remote_irr = 0;
}
+ update_handled_vectors(ioapic);
mask_after = e->fields.mask;
if (mask_before != mask_after)
kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after);
@@ -182,6 +196,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
union kvm_ioapic_redirect_entry entry;
int ret = 1;
+ mutex_lock(&ioapic->lock);
if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
entry = ioapic->redirtbl[irq];
level ^= entry.fields.polarity;
@@ -198,34 +213,54 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
}
trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
}
+ mutex_unlock(&ioapic->lock);
+
return ret;
}
-static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int pin,
- int trigger_mode)
+static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
+ int trigger_mode)
{
- union kvm_ioapic_redirect_entry *ent;
+ int i;
+
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
- ent = &ioapic->redirtbl[pin];
+ if (ent->fields.vector != vector)
+ continue;
- kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, pin);
+ /*
+ * We are dropping lock while calling ack notifiers because ack
+ * notifier callbacks for assigned devices call into IOAPIC
+ * recursively. Since remote_irr is cleared only after call
+ * to notifiers if the same vector will be delivered while lock
+ * is dropped it will be put into irr and will be delivered
+ * after ack notifier returns.
+ */
+ mutex_unlock(&ioapic->lock);
+ kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
+ mutex_lock(&ioapic->lock);
+
+ if (trigger_mode != IOAPIC_LEVEL_TRIG)
+ continue;
- if (trigger_mode == IOAPIC_LEVEL_TRIG) {
ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
ent->fields.remote_irr = 0;
- if (!ent->fields.mask && (ioapic->irr & (1 << pin)))
- ioapic_service(ioapic, pin);
+ if (!ent->fields.mask && (ioapic->irr & (1 << i)))
+ ioapic_service(ioapic, i);
}
}
void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
- int i;
- for (i = 0; i < IOAPIC_NUM_PINS; i++)
- if (ioapic->redirtbl[i].fields.vector == vector)
- __kvm_ioapic_update_eoi(ioapic, i, trigger_mode);
+ smp_rmb();
+ if (!test_bit(vector, ioapic->handled_vectors))
+ return;
+ mutex_lock(&ioapic->lock);
+ __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
+ mutex_unlock(&ioapic->lock);
}
static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
@@ -250,8 +285,8 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
ioapic_debug("addr %lx\n", (unsigned long)addr);
ASSERT(!(addr & 0xf)); /* check alignment */
- mutex_lock(&ioapic->kvm->irq_lock);
addr &= 0xff;
+ mutex_lock(&ioapic->lock);
switch (addr) {
case IOAPIC_REG_SELECT:
result = ioapic->ioregsel;
@@ -265,6 +300,8 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
result = 0;
break;
}
+ mutex_unlock(&ioapic->lock);
+
switch (len) {
case 8:
*(u64 *) val = result;
@@ -277,7 +314,6 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
default:
printk(KERN_WARNING "ioapic: wrong length %d\n", len);
}
- mutex_unlock(&ioapic->kvm->irq_lock);
return 0;
}
@@ -293,15 +329,15 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
(void*)addr, len, val);
ASSERT(!(addr & 0xf)); /* check alignment */
- mutex_lock(&ioapic->kvm->irq_lock);
if (len == 4 || len == 8)
data = *(u32 *) val;
else {
printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
- goto unlock;
+ return 0;
}
addr &= 0xff;
+ mutex_lock(&ioapic->lock);
switch (addr) {
case IOAPIC_REG_SELECT:
ioapic->ioregsel = data;
@@ -312,15 +348,14 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
break;
#ifdef CONFIG_IA64
case IOAPIC_REG_EOI:
- kvm_ioapic_update_eoi(ioapic->kvm, data, IOAPIC_LEVEL_TRIG);
+ __kvm_ioapic_update_eoi(ioapic, data, IOAPIC_LEVEL_TRIG);
break;
#endif
default:
break;
}
-unlock:
- mutex_unlock(&ioapic->kvm->irq_lock);
+ mutex_unlock(&ioapic->lock);
return 0;
}
@@ -334,6 +369,7 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
ioapic->ioregsel = 0;
ioapic->irr = 0;
ioapic->id = 0;
+ update_handled_vectors(ioapic);
}
static const struct kvm_io_device_ops ioapic_mmio_ops = {
@@ -349,14 +385,54 @@ int kvm_ioapic_init(struct kvm *kvm)
ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
if (!ioapic)
return -ENOMEM;
+ mutex_init(&ioapic->lock);
kvm->arch.vioapic = ioapic;
kvm_ioapic_reset(ioapic);
kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
ioapic->kvm = kvm;
- ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev);
- if (ret < 0)
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret < 0) {
+ kvm->arch.vioapic = NULL;
kfree(ioapic);
+ }
return ret;
}
+void kvm_ioapic_destroy(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ if (ioapic) {
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+ }
+}
+
+int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+ if (!ioapic)
+ return -EINVAL;
+
+ mutex_lock(&ioapic->lock);
+ memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+ mutex_unlock(&ioapic->lock);
+ return 0;
+}
+
+int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+ if (!ioapic)
+ return -EINVAL;
+
+ mutex_lock(&ioapic->lock);
+ memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+ update_handled_vectors(ioapic);
+ mutex_unlock(&ioapic->lock);
+ return 0;
+}
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 7080b713c160..8a751b78a430 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -41,9 +41,12 @@ struct kvm_ioapic {
u32 irr;
u32 pad;
union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
+ unsigned long irq_states[IOAPIC_NUM_PINS];
struct kvm_io_device dev;
struct kvm *kvm;
void (*ack_notifier)(void *opaque, int irq);
+ struct mutex lock;
+ DECLARE_BITMAP(handled_vectors, 256);
};
#ifdef DEBUG
@@ -69,8 +72,12 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_destroy(struct kvm *kvm);
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq);
+int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+
#endif
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 15147583abd1..80fd3ad3b2de 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -32,10 +32,10 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);
-int kvm_iommu_map_pages(struct kvm *kvm,
- gfn_t base_gfn, unsigned long npages)
+int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
{
- gfn_t gfn = base_gfn;
+ gfn_t gfn = slot->base_gfn;
+ unsigned long npages = slot->npages;
pfn_t pfn;
int i, r = 0;
struct iommu_domain *domain = kvm->arch.iommu_domain;
@@ -54,7 +54,7 @@ int kvm_iommu_map_pages(struct kvm *kvm,
if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn)))
continue;
- pfn = gfn_to_pfn(kvm, gfn);
+ pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
r = iommu_map_range(domain,
gfn_to_gpa(gfn),
pfn_to_hpa(pfn),
@@ -69,17 +69,19 @@ int kvm_iommu_map_pages(struct kvm *kvm,
return 0;
unmap_pages:
- kvm_iommu_put_pages(kvm, base_gfn, i);
+ kvm_iommu_put_pages(kvm, slot->base_gfn, i);
return r;
}
static int kvm_iommu_map_memslots(struct kvm *kvm)
{
int i, r = 0;
+ struct kvm_memslots *slots;
+
+ slots = rcu_dereference(kvm->memslots);
- for (i = 0; i < kvm->nmemslots; i++) {
- r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn,
- kvm->memslots[i].npages);
+ for (i = 0; i < slots->nmemslots; i++) {
+ r = kvm_iommu_map_pages(kvm, &slots->memslots[i]);
if (r)
break;
}
@@ -104,7 +106,8 @@ int kvm_assign_device(struct kvm *kvm,
r = iommu_attach_device(domain, &pdev->dev);
if (r) {
- printk(KERN_ERR "assign device %x:%x.%x failed",
+ printk(KERN_ERR "assign device %x:%x:%x.%x failed",
+ pci_domain_nr(pdev->bus),
pdev->bus->number,
PCI_SLOT(pdev->devfn),
PCI_FUNC(pdev->devfn));
@@ -125,7 +128,8 @@ int kvm_assign_device(struct kvm *kvm,
goto out_unmap;
}
- printk(KERN_DEBUG "assign device: host bdf = %x:%x:%x\n",
+ printk(KERN_DEBUG "assign device %x:%x:%x.%x\n",
+ assigned_dev->host_segnr,
assigned_dev->host_busnr,
PCI_SLOT(assigned_dev->host_devfn),
PCI_FUNC(assigned_dev->host_devfn));
@@ -152,7 +156,8 @@ int kvm_deassign_device(struct kvm *kvm,
iommu_detach_device(domain, &pdev->dev);
- printk(KERN_DEBUG "deassign device: host bdf = %x:%x:%x\n",
+ printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n",
+ assigned_dev->host_segnr,
assigned_dev->host_busnr,
PCI_SLOT(assigned_dev->host_devfn),
PCI_FUNC(assigned_dev->host_devfn));
@@ -210,10 +215,13 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
static int kvm_iommu_unmap_memslots(struct kvm *kvm)
{
int i;
+ struct kvm_memslots *slots;
+
+ slots = rcu_dereference(kvm->memslots);
- for (i = 0; i < kvm->nmemslots; i++) {
- kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn,
- kvm->memslots[i].npages);
+ for (i = 0; i < slots->nmemslots; i++) {
+ kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn,
+ slots->memslots[i].npages);
}
return 0;
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 001663ff401a..9fd5b3ebc517 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -31,20 +31,39 @@
#include "ioapic.h"
+static inline int kvm_irq_line_state(unsigned long *irq_state,
+ int irq_source_id, int level)
+{
+ /* Logical OR for level trig interrupt */
+ if (level)
+ set_bit(irq_source_id, irq_state);
+ else
+ clear_bit(irq_source_id, irq_state);
+
+ return !!(*irq_state);
+}
+
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int level)
+ struct kvm *kvm, int irq_source_id, int level)
{
#ifdef CONFIG_X86
- return kvm_pic_set_irq(pic_irqchip(kvm), e->irqchip.pin, level);
+ struct kvm_pic *pic = pic_irqchip(kvm);
+ level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
+ irq_source_id, level);
+ return kvm_pic_set_irq(pic, e->irqchip.pin, level);
#else
return -1;
#endif
}
static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int level)
+ struct kvm *kvm, int irq_source_id, int level)
{
- return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level);
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
+ irq_source_id, level);
+
+ return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
}
inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
@@ -63,8 +82,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
int i, r = -1;
struct kvm_vcpu *vcpu, *lowest = NULL;
- WARN_ON(!mutex_is_locked(&kvm->irq_lock));
-
if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
kvm_is_dm_lowest_prio(irq))
printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
@@ -96,10 +113,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
}
static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int level)
+ struct kvm *kvm, int irq_source_id, int level)
{
struct kvm_lapic_irq irq;
+ if (!level)
+ return -1;
+
trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
irq.dest_id = (e->msi.address_lo &
@@ -116,78 +136,67 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
}
-/* This should be called with the kvm->irq_lock mutex held
+/*
* Return value:
* < 0 Interrupt was ignored (masked or not delivered for other reasons)
* = 0 Interrupt was coalesced (previous irq is still pending)
* > 0 Number of CPUs interrupt was delivered to
*/
-int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
{
- struct kvm_kernel_irq_routing_entry *e;
- unsigned long *irq_state, sig_level;
- int ret = -1;
+ struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
+ int ret = -1, i = 0;
+ struct kvm_irq_routing_table *irq_rt;
+ struct hlist_node *n;
trace_kvm_set_irq(irq, level, irq_source_id);
- WARN_ON(!mutex_is_locked(&kvm->irq_lock));
-
- if (irq < KVM_IOAPIC_NUM_PINS) {
- irq_state = (unsigned long *)&kvm->arch.irq_states[irq];
-
- /* Logical OR for level trig interrupt */
- if (level)
- set_bit(irq_source_id, irq_state);
- else
- clear_bit(irq_source_id, irq_state);
- sig_level = !!(*irq_state);
- } else if (!level)
- return ret;
- else /* Deal with MSI/MSI-X */
- sig_level = 1;
-
/* Not possible to detect if the guest uses the PIC or the
* IOAPIC. So set the bit in both. The guest will ignore
* writes to the unused one.
*/
- list_for_each_entry(e, &kvm->irq_routing, link)
- if (e->gsi == irq) {
- int r = e->set(e, kvm, sig_level);
- if (r < 0)
- continue;
+ rcu_read_lock();
+ irq_rt = rcu_dereference(kvm->irq_routing);
+ if (irq < irq_rt->nr_rt_entries)
+ hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
+ irq_set[i++] = *e;
+ rcu_read_unlock();
+
+ while(i--) {
+ int r;
+ r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level);
+ if (r < 0)
+ continue;
+
+ ret = r + ((ret < 0) ? 0 : ret);
+ }
- ret = r + ((ret < 0) ? 0 : ret);
- }
return ret;
}
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
- struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_ack_notifier *kian;
struct hlist_node *n;
- unsigned gsi = pin;
+ int gsi;
trace_kvm_ack_irq(irqchip, pin);
- list_for_each_entry(e, &kvm->irq_routing, link)
- if (e->type == KVM_IRQ_ROUTING_IRQCHIP &&
- e->irqchip.irqchip == irqchip &&
- e->irqchip.pin == pin) {
- gsi = e->gsi;
- break;
- }
-
- hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
- if (kian->gsi == gsi)
- kian->irq_acked(kian);
+ rcu_read_lock();
+ gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list,
+ link)
+ if (kian->gsi == gsi)
+ kian->irq_acked(kian);
+ rcu_read_unlock();
}
void kvm_register_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian)
{
mutex_lock(&kvm->irq_lock);
- hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
+ hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
mutex_unlock(&kvm->irq_lock);
}
@@ -195,8 +204,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian)
{
mutex_lock(&kvm->irq_lock);
- hlist_del_init(&kian->link);
+ hlist_del_init_rcu(&kian->link);
mutex_unlock(&kvm->irq_lock);
+ synchronize_rcu();
}
int kvm_request_irq_source_id(struct kvm *kvm)
@@ -205,16 +215,17 @@ int kvm_request_irq_source_id(struct kvm *kvm)
int irq_source_id;
mutex_lock(&kvm->irq_lock);
- irq_source_id = find_first_zero_bit(bitmap,
- sizeof(kvm->arch.irq_sources_bitmap));
+ irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
- if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
+ if (irq_source_id >= BITS_PER_LONG) {
printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
- return -EFAULT;
+ irq_source_id = -EFAULT;
+ goto unlock;
}
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
set_bit(irq_source_id, bitmap);
+unlock:
mutex_unlock(&kvm->irq_lock);
return irq_source_id;
@@ -228,13 +239,23 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
mutex_lock(&kvm->irq_lock);
if (irq_source_id < 0 ||
- irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
+ irq_source_id >= BITS_PER_LONG) {
printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
- return;
+ goto unlock;
}
- for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
- clear_bit(irq_source_id, &kvm->arch.irq_states[i]);
clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
+ if (!irqchip_in_kernel(kvm))
+ goto unlock;
+
+ for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) {
+ clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]);
+ if (i >= 16)
+ continue;
+#ifdef CONFIG_X86
+ clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]);
+#endif
+ }
+unlock:
mutex_unlock(&kvm->irq_lock);
}
@@ -243,7 +264,7 @@ void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
{
mutex_lock(&kvm->irq_lock);
kimn->irq = irq;
- hlist_add_head(&kimn->link, &kvm->mask_notifier_list);
+ hlist_add_head_rcu(&kimn->link, &kvm->mask_notifier_list);
mutex_unlock(&kvm->irq_lock);
}
@@ -251,8 +272,9 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
struct kvm_irq_mask_notifier *kimn)
{
mutex_lock(&kvm->irq_lock);
- hlist_del(&kimn->link);
+ hlist_del_rcu(&kimn->link);
mutex_unlock(&kvm->irq_lock);
+ synchronize_rcu();
}
void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
@@ -260,33 +282,38 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
struct kvm_irq_mask_notifier *kimn;
struct hlist_node *n;
- WARN_ON(!mutex_is_locked(&kvm->irq_lock));
-
- hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link)
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
if (kimn->irq == irq)
kimn->func(kimn, mask);
-}
-
-static void __kvm_free_irq_routing(struct list_head *irq_routing)
-{
- struct kvm_kernel_irq_routing_entry *e, *n;
-
- list_for_each_entry_safe(e, n, irq_routing, link)
- kfree(e);
+ rcu_read_unlock();
}
void kvm_free_irq_routing(struct kvm *kvm)
{
- mutex_lock(&kvm->irq_lock);
- __kvm_free_irq_routing(&kvm->irq_routing);
- mutex_unlock(&kvm->irq_lock);
+ /* Called only during vm destruction. Nobody can use the pointer
+ at this stage */
+ kfree(kvm->irq_routing);
}
-static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+static int setup_routing_entry(struct kvm_irq_routing_table *rt,
+ struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
int r = -EINVAL;
int delta;
+ unsigned max_pin;
+ struct kvm_kernel_irq_routing_entry *ei;
+ struct hlist_node *n;
+
+ /*
+ * Do not allow GSI to be mapped to the same irqchip more than once.
+ * Allow only one to one mapping between GSI and MSI.
+ */
+ hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link)
+ if (ei->type == KVM_IRQ_ROUTING_MSI ||
+ ue->u.irqchip.irqchip == ei->irqchip.irqchip)
+ return r;
e->gsi = ue->gsi;
e->type = ue->type;
@@ -296,12 +323,15 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_MASTER:
e->set = kvm_set_pic_irq;
+ max_pin = 16;
break;
case KVM_IRQCHIP_PIC_SLAVE:
e->set = kvm_set_pic_irq;
+ max_pin = 16;
delta = 8;
break;
case KVM_IRQCHIP_IOAPIC:
+ max_pin = KVM_IOAPIC_NUM_PINS;
e->set = kvm_set_ioapic_irq;
break;
default:
@@ -309,6 +339,9 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
}
e->irqchip.irqchip = ue->u.irqchip.irqchip;
e->irqchip.pin = ue->u.irqchip.pin + delta;
+ if (e->irqchip.pin >= max_pin)
+ goto out;
+ rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
break;
case KVM_IRQ_ROUTING_MSI:
e->set = kvm_set_msi;
@@ -319,6 +352,8 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
default:
goto out;
}
+
+ hlist_add_head(&e->link, &rt->map[e->gsi]);
r = 0;
out:
return r;
@@ -330,43 +365,53 @@ int kvm_set_irq_routing(struct kvm *kvm,
unsigned nr,
unsigned flags)
{
- struct list_head irq_list = LIST_HEAD_INIT(irq_list);
- struct list_head tmp = LIST_HEAD_INIT(tmp);
- struct kvm_kernel_irq_routing_entry *e = NULL;
- unsigned i;
+ struct kvm_irq_routing_table *new, *old;
+ u32 i, j, nr_rt_entries = 0;
int r;
for (i = 0; i < nr; ++i) {
+ if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES)
+ return -EINVAL;
+ nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
+ }
+
+ nr_rt_entries += 1;
+
+ new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head))
+ + (nr * sizeof(struct kvm_kernel_irq_routing_entry)),
+ GFP_KERNEL);
+
+ if (!new)
+ return -ENOMEM;
+
+ new->rt_entries = (void *)&new->map[nr_rt_entries];
+
+ new->nr_rt_entries = nr_rt_entries;
+ for (i = 0; i < 3; i++)
+ for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++)
+ new->chip[i][j] = -1;
+
+ for (i = 0; i < nr; ++i) {
r = -EINVAL;
- if (ue->gsi >= KVM_MAX_IRQ_ROUTES)
- goto out;
if (ue->flags)
goto out;
- r = -ENOMEM;
- e = kzalloc(sizeof(*e), GFP_KERNEL);
- if (!e)
- goto out;
- r = setup_routing_entry(e, ue);
+ r = setup_routing_entry(new, &new->rt_entries[i], ue);
if (r)
goto out;
++ue;
- list_add(&e->link, &irq_list);
- e = NULL;
}
mutex_lock(&kvm->irq_lock);
- list_splice(&kvm->irq_routing, &tmp);
- INIT_LIST_HEAD(&kvm->irq_routing);
- list_splice(&irq_list, &kvm->irq_routing);
- INIT_LIST_HEAD(&irq_list);
- list_splice(&tmp, &irq_list);
+ old = kvm->irq_routing;
+ rcu_assign_pointer(kvm->irq_routing, new);
mutex_unlock(&kvm->irq_lock);
+ synchronize_rcu();
+ new = old;
r = 0;
out:
- kfree(e);
- __kvm_free_irq_routing(&irq_list);
+ kfree(new);
return r;
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b5e7e3f1183f..548f9253c195 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -43,21 +43,17 @@
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
+#include <linux/compat.h>
+#include <linux/srcu.h>
+#include <linux/hugetlb.h>
#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
+#include <asm-generic/bitops/le.h>
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
#include "coalesced_mmio.h"
-#endif
-
-#ifdef KVM_CAP_DEVICE_ASSIGNMENT
-#include <linux/pci.h>
-#include <linux/interrupt.h>
-#include "irq.h"
-#endif
#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>
@@ -68,13 +64,15 @@ MODULE_LICENSE("GPL");
/*
* Ordering of locks:
*
- * kvm->slots_lock --> kvm->lock --> kvm->irq_lock
+ * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
*/
DEFINE_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);
static cpumask_var_t cpus_hardware_enabled;
+static int kvm_usage_count = 0;
+static atomic_t hardware_enable_failed;
struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
@@ -85,615 +83,15 @@ struct dentry *kvm_debugfs_dir;
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
unsigned long arg);
+static int hardware_enable_all(void);
+static void hardware_disable_all(void);
+
+static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
static bool kvm_rebooting;
static bool largepages_enabled = true;
-#ifdef KVM_CAP_DEVICE_ASSIGNMENT
-static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
- int assigned_dev_id)
-{
- struct list_head *ptr;
- struct kvm_assigned_dev_kernel *match;
-
- list_for_each(ptr, head) {
- match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
- if (match->assigned_dev_id == assigned_dev_id)
- return match;
- }
- return NULL;
-}
-
-static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
- *assigned_dev, int irq)
-{
- int i, index;
- struct msix_entry *host_msix_entries;
-
- host_msix_entries = assigned_dev->host_msix_entries;
-
- index = -1;
- for (i = 0; i < assigned_dev->entries_nr; i++)
- if (irq == host_msix_entries[i].vector) {
- index = i;
- break;
- }
- if (index < 0) {
- printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
- return 0;
- }
-
- return index;
-}
-
-static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
-{
- struct kvm_assigned_dev_kernel *assigned_dev;
- struct kvm *kvm;
- int i;
-
- assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
- interrupt_work);
- kvm = assigned_dev->kvm;
-
- mutex_lock(&kvm->irq_lock);
- spin_lock_irq(&assigned_dev->assigned_dev_lock);
- if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
- struct kvm_guest_msix_entry *guest_entries =
- assigned_dev->guest_msix_entries;
- for (i = 0; i < assigned_dev->entries_nr; i++) {
- if (!(guest_entries[i].flags &
- KVM_ASSIGNED_MSIX_PENDING))
- continue;
- guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
- kvm_set_irq(assigned_dev->kvm,
- assigned_dev->irq_source_id,
- guest_entries[i].vector, 1);
- }
- } else
- kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
- assigned_dev->guest_irq, 1);
-
- spin_unlock_irq(&assigned_dev->assigned_dev_lock);
- mutex_unlock(&assigned_dev->kvm->irq_lock);
-}
-
-static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
-{
- unsigned long flags;
- struct kvm_assigned_dev_kernel *assigned_dev =
- (struct kvm_assigned_dev_kernel *) dev_id;
-
- spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
- if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
- int index = find_index_from_host_irq(assigned_dev, irq);
- if (index < 0)
- goto out;
- assigned_dev->guest_msix_entries[index].flags |=
- KVM_ASSIGNED_MSIX_PENDING;
- }
-
- schedule_work(&assigned_dev->interrupt_work);
-
- if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
- disable_irq_nosync(irq);
- assigned_dev->host_irq_disabled = true;
- }
-
-out:
- spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
- return IRQ_HANDLED;
-}
-
-/* Ack the irq line for an assigned device */
-static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
-{
- struct kvm_assigned_dev_kernel *dev;
- unsigned long flags;
-
- if (kian->gsi == -1)
- return;
-
- dev = container_of(kian, struct kvm_assigned_dev_kernel,
- ack_notifier);
-
- kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
-
- /* The guest irq may be shared so this ack may be
- * from another device.
- */
- spin_lock_irqsave(&dev->assigned_dev_lock, flags);
- if (dev->host_irq_disabled) {
- enable_irq(dev->host_irq);
- dev->host_irq_disabled = false;
- }
- spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
-}
-
-static void deassign_guest_irq(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *assigned_dev)
-{
- kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
- assigned_dev->ack_notifier.gsi = -1;
-
- if (assigned_dev->irq_source_id != -1)
- kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
- assigned_dev->irq_source_id = -1;
- assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
-}
-
-/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
-static void deassign_host_irq(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *assigned_dev)
-{
- /*
- * In kvm_free_device_irq, cancel_work_sync return true if:
- * 1. work is scheduled, and then cancelled.
- * 2. work callback is executed.
- *
- * The first one ensured that the irq is disabled and no more events
- * would happen. But for the second one, the irq may be enabled (e.g.
- * for MSI). So we disable irq here to prevent further events.
- *
- * Notice this maybe result in nested disable if the interrupt type is
- * INTx, but it's OK for we are going to free it.
- *
- * If this function is a part of VM destroy, please ensure that till
- * now, the kvm state is still legal for probably we also have to wait
- * interrupt_work done.
- */
- if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
- int i;
- for (i = 0; i < assigned_dev->entries_nr; i++)
- disable_irq_nosync(assigned_dev->
- host_msix_entries[i].vector);
-
- cancel_work_sync(&assigned_dev->interrupt_work);
-
- for (i = 0; i < assigned_dev->entries_nr; i++)
- free_irq(assigned_dev->host_msix_entries[i].vector,
- (void *)assigned_dev);
-
- assigned_dev->entries_nr = 0;
- kfree(assigned_dev->host_msix_entries);
- kfree(assigned_dev->guest_msix_entries);
- pci_disable_msix(assigned_dev->dev);
- } else {
- /* Deal with MSI and INTx */
- disable_irq_nosync(assigned_dev->host_irq);
- cancel_work_sync(&assigned_dev->interrupt_work);
-
- free_irq(assigned_dev->host_irq, (void *)assigned_dev);
-
- if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
- pci_disable_msi(assigned_dev->dev);
- }
-
- assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
-}
-
-static int kvm_deassign_irq(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *assigned_dev,
- unsigned long irq_requested_type)
-{
- unsigned long guest_irq_type, host_irq_type;
-
- if (!irqchip_in_kernel(kvm))
- return -EINVAL;
- /* no irq assignment to deassign */
- if (!assigned_dev->irq_requested_type)
- return -ENXIO;
-
- host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
- guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
-
- if (host_irq_type)
- deassign_host_irq(kvm, assigned_dev);
- if (guest_irq_type)
- deassign_guest_irq(kvm, assigned_dev);
-
- return 0;
-}
-
-static void kvm_free_assigned_irq(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *assigned_dev)
-{
- kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
-}
-
-static void kvm_free_assigned_device(struct kvm *kvm,
- struct kvm_assigned_dev_kernel
- *assigned_dev)
-{
- kvm_free_assigned_irq(kvm, assigned_dev);
-
- pci_reset_function(assigned_dev->dev);
-
- pci_release_regions(assigned_dev->dev);
- pci_disable_device(assigned_dev->dev);
- pci_dev_put(assigned_dev->dev);
-
- list_del(&assigned_dev->list);
- kfree(assigned_dev);
-}
-
-void kvm_free_all_assigned_devices(struct kvm *kvm)
-{
- struct list_head *ptr, *ptr2;
- struct kvm_assigned_dev_kernel *assigned_dev;
-
- list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
- assigned_dev = list_entry(ptr,
- struct kvm_assigned_dev_kernel,
- list);
-
- kvm_free_assigned_device(kvm, assigned_dev);
- }
-}
-
-static int assigned_device_enable_host_intx(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev)
-{
- dev->host_irq = dev->dev->irq;
- /* Even though this is PCI, we don't want to use shared
- * interrupts. Sharing host devices with guest-assigned devices
- * on the same interrupt line is not a happy situation: there
- * are going to be long delays in accepting, acking, etc.
- */
- if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
- 0, "kvm_assigned_intx_device", (void *)dev))
- return -EIO;
- return 0;
-}
-
-#ifdef __KVM_HAVE_MSI
-static int assigned_device_enable_host_msi(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev)
-{
- int r;
-
- if (!dev->dev->msi_enabled) {
- r = pci_enable_msi(dev->dev);
- if (r)
- return r;
- }
-
- dev->host_irq = dev->dev->irq;
- if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
- "kvm_assigned_msi_device", (void *)dev)) {
- pci_disable_msi(dev->dev);
- return -EIO;
- }
-
- return 0;
-}
-#endif
-
-#ifdef __KVM_HAVE_MSIX
-static int assigned_device_enable_host_msix(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev)
-{
- int i, r = -EINVAL;
-
- /* host_msix_entries and guest_msix_entries should have been
- * initialized */
- if (dev->entries_nr == 0)
- return r;
-
- r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
- if (r)
- return r;
-
- for (i = 0; i < dev->entries_nr; i++) {
- r = request_irq(dev->host_msix_entries[i].vector,
- kvm_assigned_dev_intr, 0,
- "kvm_assigned_msix_device",
- (void *)dev);
- /* FIXME: free requested_irq's on failure */
- if (r)
- return r;
- }
-
- return 0;
-}
-
-#endif
-
-static int assigned_device_enable_guest_intx(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev,
- struct kvm_assigned_irq *irq)
-{
- dev->guest_irq = irq->guest_irq;
- dev->ack_notifier.gsi = irq->guest_irq;
- return 0;
-}
-
-#ifdef __KVM_HAVE_MSI
-static int assigned_device_enable_guest_msi(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev,
- struct kvm_assigned_irq *irq)
-{
- dev->guest_irq = irq->guest_irq;
- dev->ack_notifier.gsi = -1;
- dev->host_irq_disabled = false;
- return 0;
-}
-#endif
-#ifdef __KVM_HAVE_MSIX
-static int assigned_device_enable_guest_msix(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev,
- struct kvm_assigned_irq *irq)
-{
- dev->guest_irq = irq->guest_irq;
- dev->ack_notifier.gsi = -1;
- dev->host_irq_disabled = false;
- return 0;
-}
-#endif
-
-static int assign_host_irq(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev,
- __u32 host_irq_type)
-{
- int r = -EEXIST;
-
- if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
- return r;
-
- switch (host_irq_type) {
- case KVM_DEV_IRQ_HOST_INTX:
- r = assigned_device_enable_host_intx(kvm, dev);
- break;
-#ifdef __KVM_HAVE_MSI
- case KVM_DEV_IRQ_HOST_MSI:
- r = assigned_device_enable_host_msi(kvm, dev);
- break;
-#endif
-#ifdef __KVM_HAVE_MSIX
- case KVM_DEV_IRQ_HOST_MSIX:
- r = assigned_device_enable_host_msix(kvm, dev);
- break;
-#endif
- default:
- r = -EINVAL;
- }
-
- if (!r)
- dev->irq_requested_type |= host_irq_type;
-
- return r;
-}
-
-static int assign_guest_irq(struct kvm *kvm,
- struct kvm_assigned_dev_kernel *dev,
- struct kvm_assigned_irq *irq,
- unsigned long guest_irq_type)
-{
- int id;
- int r = -EEXIST;
-
- if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
- return r;
-
- id = kvm_request_irq_source_id(kvm);
- if (id < 0)
- return id;
-
- dev->irq_source_id = id;
-
- switch (guest_irq_type) {
- case KVM_DEV_IRQ_GUEST_INTX:
- r = assigned_device_enable_guest_intx(kvm, dev, irq);
- break;
-#ifdef __KVM_HAVE_MSI
- case KVM_DEV_IRQ_GUEST_MSI:
- r = assigned_device_enable_guest_msi(kvm, dev, irq);
- break;
-#endif
-#ifdef __KVM_HAVE_MSIX
- case KVM_DEV_IRQ_GUEST_MSIX:
- r = assigned_device_enable_guest_msix(kvm, dev, irq);
- break;
-#endif
- default:
- r = -EINVAL;
- }
-
- if (!r) {
- dev->irq_requested_type |= guest_irq_type;
- kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
- } else
- kvm_free_irq_source_id(kvm, dev->irq_source_id);
-
- return r;
-}
-
-/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
-static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
- struct kvm_assigned_irq *assigned_irq)
-{
- int r = -EINVAL;
- struct kvm_assigned_dev_kernel *match;
- unsigned long host_irq_type, guest_irq_type;
-
- if (!capable(CAP_SYS_RAWIO))
- return -EPERM;
-
- if (!irqchip_in_kernel(kvm))
- return r;
-
- mutex_lock(&kvm->lock);
- r = -ENODEV;
- match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
- assigned_irq->assigned_dev_id);
- if (!match)
- goto out;
-
- host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
- guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
-
- r = -EINVAL;
- /* can only assign one type at a time */
- if (hweight_long(host_irq_type) > 1)
- goto out;
- if (hweight_long(guest_irq_type) > 1)
- goto out;
- if (host_irq_type == 0 && guest_irq_type == 0)
- goto out;
-
- r = 0;
- if (host_irq_type)
- r = assign_host_irq(kvm, match, host_irq_type);
- if (r)
- goto out;
-
- if (guest_irq_type)
- r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
-out:
- mutex_unlock(&kvm->lock);
- return r;
-}
-
-static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
- struct kvm_assigned_irq
- *assigned_irq)
-{
- int r = -ENODEV;
- struct kvm_assigned_dev_kernel *match;
-
- mutex_lock(&kvm->lock);
-
- match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
- assigned_irq->assigned_dev_id);
- if (!match)
- goto out;
-
- r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
-out:
- mutex_unlock(&kvm->lock);
- return r;
-}
-
-static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
- struct kvm_assigned_pci_dev *assigned_dev)
-{
- int r = 0;
- struct kvm_assigned_dev_kernel *match;
- struct pci_dev *dev;
-
- down_read(&kvm->slots_lock);
- mutex_lock(&kvm->lock);
-
- match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
- assigned_dev->assigned_dev_id);
- if (match) {
- /* device already assigned */
- r = -EEXIST;
- goto out;
- }
-
- match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
- if (match == NULL) {
- printk(KERN_INFO "%s: Couldn't allocate memory\n",
- __func__);
- r = -ENOMEM;
- goto out;
- }
- dev = pci_get_bus_and_slot(assigned_dev->busnr,
- assigned_dev->devfn);
- if (!dev) {
- printk(KERN_INFO "%s: host device not found\n", __func__);
- r = -EINVAL;
- goto out_free;
- }
- if (pci_enable_device(dev)) {
- printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
- r = -EBUSY;
- goto out_put;
- }
- r = pci_request_regions(dev, "kvm_assigned_device");
- if (r) {
- printk(KERN_INFO "%s: Could not get access to device regions\n",
- __func__);
- goto out_disable;
- }
-
- pci_reset_function(dev);
-
- match->assigned_dev_id = assigned_dev->assigned_dev_id;
- match->host_busnr = assigned_dev->busnr;
- match->host_devfn = assigned_dev->devfn;
- match->flags = assigned_dev->flags;
- match->dev = dev;
- spin_lock_init(&match->assigned_dev_lock);
- match->irq_source_id = -1;
- match->kvm = kvm;
- match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
- INIT_WORK(&match->interrupt_work,
- kvm_assigned_dev_interrupt_work_handler);
-
- list_add(&match->list, &kvm->arch.assigned_dev_head);
-
- if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
- if (!kvm->arch.iommu_domain) {
- r = kvm_iommu_map_guest(kvm);
- if (r)
- goto out_list_del;
- }
- r = kvm_assign_device(kvm, match);
- if (r)
- goto out_list_del;
- }
-
-out:
- mutex_unlock(&kvm->lock);
- up_read(&kvm->slots_lock);
- return r;
-out_list_del:
- list_del(&match->list);
- pci_release_regions(dev);
-out_disable:
- pci_disable_device(dev);
-out_put:
- pci_dev_put(dev);
-out_free:
- kfree(match);
- mutex_unlock(&kvm->lock);
- up_read(&kvm->slots_lock);
- return r;
-}
-#endif
-
-#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
-static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
- struct kvm_assigned_pci_dev *assigned_dev)
-{
- int r = 0;
- struct kvm_assigned_dev_kernel *match;
-
- mutex_lock(&kvm->lock);
-
- match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
- assigned_dev->assigned_dev_id);
- if (!match) {
- printk(KERN_INFO "%s: device hasn't been assigned before, "
- "so cannot be deassigned\n", __func__);
- r = -EINVAL;
- goto out;
- }
-
- if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
- kvm_deassign_device(kvm, match);
-
- kvm_free_assigned_device(kvm, match);
-
-out:
- mutex_unlock(&kvm->lock);
- return r;
-}
-#endif
-
inline int kvm_is_mmio_pfn(pfn_t pfn)
{
if (pfn_valid(pfn)) {
@@ -740,7 +138,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
zalloc_cpumask_var(&cpus, GFP_ATOMIC);
- spin_lock(&kvm->requests_lock);
+ raw_spin_lock(&kvm->requests_lock);
me = smp_processor_id();
kvm_for_each_vcpu(i, vcpu, kvm) {
if (test_and_set_bit(req, &vcpu->requests))
@@ -755,7 +153,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
smp_call_function_many(cpus, ack_flush, NULL, 1);
else
called = false;
- spin_unlock(&kvm->requests_lock);
+ raw_spin_unlock(&kvm->requests_lock);
free_cpumask_var(cpus);
return called;
}
@@ -819,7 +217,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
unsigned long address)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
- int need_tlb_flush;
+ int need_tlb_flush, idx;
/*
* When ->invalidate_page runs, the linux pte has been zapped
@@ -839,10 +237,12 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
* pte after kvm_unmap_hva returned, without noticing the page
* is going to be freed.
*/
+ idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
kvm->mmu_notifier_seq++;
need_tlb_flush = kvm_unmap_hva(kvm, address);
spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
@@ -850,14 +250,31 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
}
+static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address,
+ pte_t pte)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+ kvm->mmu_notifier_seq++;
+ kvm_set_spte_hva(kvm, address, pte);
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+}
+
static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
- int need_tlb_flush = 0;
+ int need_tlb_flush = 0, idx;
+ idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
/*
* The count increase must become visible at unlock time as no
@@ -868,6 +285,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
for (; start < end; start += PAGE_SIZE)
need_tlb_flush |= kvm_unmap_hva(kvm, start);
spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
@@ -905,11 +323,13 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
unsigned long address)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
- int young;
+ int young, idx;
+ idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
young = kvm_age_hva(kvm, address);
spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
if (young)
kvm_flush_remote_tlbs(kvm);
@@ -929,59 +349,71 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
+ .change_pte = kvm_mmu_notifier_change_pte,
.release = kvm_mmu_notifier_release,
};
+
+static int kvm_init_mmu_notifier(struct kvm *kvm)
+{
+ kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
+}
+
+#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
+
+static int kvm_init_mmu_notifier(struct kvm *kvm)
+{
+ return 0;
+}
+
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
static struct kvm *kvm_create_vm(void)
{
+ int r = 0, i;
struct kvm *kvm = kvm_arch_create_vm();
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
- struct page *page;
-#endif
if (IS_ERR(kvm))
goto out;
+
+ r = hardware_enable_all();
+ if (r)
+ goto out_err_nodisable;
+
#ifdef CONFIG_HAVE_KVM_IRQCHIP
- INIT_LIST_HEAD(&kvm->irq_routing);
INIT_HLIST_HEAD(&kvm->mask_notifier_list);
+ INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page) {
- kfree(kvm);
- return ERR_PTR(-ENOMEM);
+ r = -ENOMEM;
+ kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!kvm->memslots)
+ goto out_err;
+ if (init_srcu_struct(&kvm->srcu))
+ goto out_err;
+ for (i = 0; i < KVM_NR_BUSES; i++) {
+ kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
+ GFP_KERNEL);
+ if (!kvm->buses[i]) {
+ cleanup_srcu_struct(&kvm->srcu);
+ goto out_err;
+ }
}
- kvm->coalesced_mmio_ring =
- (struct kvm_coalesced_mmio_ring *)page_address(page);
-#endif
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
- {
- int err;
- kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
- err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
- if (err) {
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
- put_page(page);
-#endif
- kfree(kvm);
- return ERR_PTR(err);
- }
+ r = kvm_init_mmu_notifier(kvm);
+ if (r) {
+ cleanup_srcu_struct(&kvm->srcu);
+ goto out_err;
}
-#endif
kvm->mm = current->mm;
atomic_inc(&kvm->mm->mm_count);
spin_lock_init(&kvm->mmu_lock);
- spin_lock_init(&kvm->requests_lock);
- kvm_io_bus_init(&kvm->pio_bus);
+ raw_spin_lock_init(&kvm->requests_lock);
kvm_eventfd_init(kvm);
mutex_init(&kvm->lock);
mutex_init(&kvm->irq_lock);
- kvm_io_bus_init(&kvm->mmio_bus);
- init_rwsem(&kvm->slots_lock);
+ mutex_init(&kvm->slots_lock);
atomic_set(&kvm->users_count, 1);
spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
@@ -991,6 +423,15 @@ static struct kvm *kvm_create_vm(void)
#endif
out:
return kvm;
+
+out_err:
+ hardware_disable_all();
+out_err_nodisable:
+ for (i = 0; i < KVM_NR_BUSES; i++)
+ kfree(kvm->buses[i]);
+ kfree(kvm->memslots);
+ kfree(kvm);
+ return ERR_PTR(r);
}
/*
@@ -1023,13 +464,17 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
void kvm_free_physmem(struct kvm *kvm)
{
int i;
+ struct kvm_memslots *slots = kvm->memslots;
+
+ for (i = 0; i < slots->nmemslots; ++i)
+ kvm_free_physmem_slot(&slots->memslots[i], NULL);
- for (i = 0; i < kvm->nmemslots; ++i)
- kvm_free_physmem_slot(&kvm->memslots[i], NULL);
+ kfree(kvm->memslots);
}
static void kvm_destroy_vm(struct kvm *kvm)
{
+ int i;
struct mm_struct *mm = kvm->mm;
kvm_arch_sync_events(kvm);
@@ -1037,18 +482,16 @@ static void kvm_destroy_vm(struct kvm *kvm)
list_del(&kvm->vm_list);
spin_unlock(&kvm_lock);
kvm_free_irq_routing(kvm);
- kvm_io_bus_destroy(&kvm->pio_bus);
- kvm_io_bus_destroy(&kvm->mmio_bus);
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
- if (kvm->coalesced_mmio_ring != NULL)
- free_page((unsigned long)kvm->coalesced_mmio_ring);
-#endif
+ for (i = 0; i < KVM_NR_BUSES; i++)
+ kvm_io_bus_destroy(kvm->buses[i]);
+ kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
kvm_arch_flush_shadow(kvm);
#endif
kvm_arch_destroy_vm(kvm);
+ hardware_disable_all();
mmdrop(mm);
}
@@ -1088,12 +531,13 @@ int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
int user_alloc)
{
- int r;
+ int r, flush_shadow = 0;
gfn_t base_gfn;
unsigned long npages;
unsigned long i;
struct kvm_memory_slot *memslot;
struct kvm_memory_slot old, new;
+ struct kvm_memslots *slots, *old_memslots;
r = -EINVAL;
/* General sanity checks */
@@ -1108,7 +552,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
goto out;
- memslot = &kvm->memslots[mem->slot];
+ memslot = &kvm->memslots->memslots[mem->slot];
base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
npages = mem->memory_size >> PAGE_SHIFT;
@@ -1129,7 +573,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
/* Check for overlaps */
r = -EEXIST;
for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *s = &kvm->memslots[i];
+ struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
if (s == memslot || !s->npages)
continue;
@@ -1155,15 +599,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
memset(new.rmap, 0, npages * sizeof(*new.rmap));
new.user_alloc = user_alloc;
- /*
- * hva_to_rmmap() serialzies with the mmu_lock and to be
- * safe it has to ignore memslots with !user_alloc &&
- * !userspace_addr.
- */
- if (user_alloc)
- new.userspace_addr = mem->userspace_addr;
- else
- new.userspace_addr = 0;
+ new.userspace_addr = mem->userspace_addr;
}
if (!npages)
goto skip_lpage;
@@ -1218,8 +654,9 @@ skip_lpage:
if (!new.dirty_bitmap)
goto out_free;
memset(new.dirty_bitmap, 0, dirty_bytes);
+ /* destroy any largepage mappings for dirty tracking */
if (old.npages)
- kvm_arch_flush_shadow(kvm);
+ flush_shadow = 1;
}
#else /* not defined CONFIG_S390 */
new.user_alloc = user_alloc;
@@ -1227,36 +664,72 @@ skip_lpage:
new.userspace_addr = mem->userspace_addr;
#endif /* not defined CONFIG_S390 */
- if (!npages)
+ if (!npages) {
+ r = -ENOMEM;
+ slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!slots)
+ goto out_free;
+ memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+ if (mem->slot >= slots->nmemslots)
+ slots->nmemslots = mem->slot + 1;
+ slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
+
+ old_memslots = kvm->memslots;
+ rcu_assign_pointer(kvm->memslots, slots);
+ synchronize_srcu_expedited(&kvm->srcu);
+ /* From this point no new shadow pages pointing to a deleted
+ * memslot will be created.
+ *
+ * validation of sp->gfn happens in:
+ * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
+ * - kvm_is_visible_gfn (mmu_check_roots)
+ */
kvm_arch_flush_shadow(kvm);
+ kfree(old_memslots);
+ }
- spin_lock(&kvm->mmu_lock);
- if (mem->slot >= kvm->nmemslots)
- kvm->nmemslots = mem->slot + 1;
-
- *memslot = new;
- spin_unlock(&kvm->mmu_lock);
-
- r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
- if (r) {
- spin_lock(&kvm->mmu_lock);
- *memslot = old;
- spin_unlock(&kvm->mmu_lock);
+ r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
+ if (r)
goto out_free;
- }
- kvm_free_physmem_slot(&old, npages ? &new : NULL);
- /* Slot deletion case: we have to update the current slot */
- spin_lock(&kvm->mmu_lock);
- if (!npages)
- *memslot = old;
- spin_unlock(&kvm->mmu_lock);
#ifdef CONFIG_DMAR
/* map the pages in iommu page table */
- r = kvm_iommu_map_pages(kvm, base_gfn, npages);
- if (r)
- goto out;
+ if (npages) {
+ r = kvm_iommu_map_pages(kvm, &new);
+ if (r)
+ goto out_free;
+ }
#endif
+
+ r = -ENOMEM;
+ slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!slots)
+ goto out_free;
+ memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+ if (mem->slot >= slots->nmemslots)
+ slots->nmemslots = mem->slot + 1;
+
+ /* actual memory is freed via old in kvm_free_physmem_slot below */
+ if (!npages) {
+ new.rmap = NULL;
+ new.dirty_bitmap = NULL;
+ for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
+ new.lpage_info[i] = NULL;
+ }
+
+ slots->memslots[mem->slot] = new;
+ old_memslots = kvm->memslots;
+ rcu_assign_pointer(kvm->memslots, slots);
+ synchronize_srcu_expedited(&kvm->srcu);
+
+ kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
+
+ kvm_free_physmem_slot(&old, &new);
+ kfree(old_memslots);
+
+ if (flush_shadow)
+ kvm_arch_flush_shadow(kvm);
+
return 0;
out_free:
@@ -1273,9 +746,9 @@ int kvm_set_memory_region(struct kvm *kvm,
{
int r;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
r = __kvm_set_memory_region(kvm, mem, user_alloc);
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);
@@ -1302,7 +775,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
if (log->slot >= KVM_MEMORY_SLOTS)
goto out;
- memslot = &kvm->memslots[log->slot];
+ memslot = &kvm->memslots->memslots[log->slot];
r = -ENOENT;
if (!memslot->dirty_bitmap)
goto out;
@@ -1356,9 +829,10 @@ EXPORT_SYMBOL_GPL(kvm_is_error_hva);
struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
{
int i;
+ struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
- for (i = 0; i < kvm->nmemslots; ++i) {
- struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ for (i = 0; i < slots->nmemslots; ++i) {
+ struct kvm_memory_slot *memslot = &slots->memslots[i];
if (gfn >= memslot->base_gfn
&& gfn < memslot->base_gfn + memslot->npages)
@@ -1377,10 +851,14 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
int i;
+ struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
- gfn = unalias_gfn(kvm, gfn);
+ gfn = unalias_gfn_instantiation(kvm, gfn);
for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ struct kvm_memory_slot *memslot = &slots->memslots[i];
+
+ if (memslot->flags & KVM_MEMSLOT_INVALID)
+ continue;
if (gfn >= memslot->base_gfn
&& gfn < memslot->base_gfn + memslot->npages)
@@ -1390,33 +868,68 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
+unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
+{
+ struct vm_area_struct *vma;
+ unsigned long addr, size;
+
+ size = PAGE_SIZE;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return PAGE_SIZE;
+
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, addr);
+ if (!vma)
+ goto out;
+
+ size = vma_kernel_pagesize(vma);
+
+out:
+ up_read(&current->mm->mmap_sem);
+
+ return size;
+}
+
+int memslot_id(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+ struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
+ struct kvm_memory_slot *memslot = NULL;
+
+ gfn = unalias_gfn(kvm, gfn);
+ for (i = 0; i < slots->nmemslots; ++i) {
+ memslot = &slots->memslots[i];
+
+ if (gfn >= memslot->base_gfn
+ && gfn < memslot->base_gfn + memslot->npages)
+ break;
+ }
+
+ return memslot - slots->memslots;
+}
+
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
- gfn = unalias_gfn(kvm, gfn);
+ gfn = unalias_gfn_instantiation(kvm, gfn);
slot = gfn_to_memslot_unaliased(kvm, gfn);
- if (!slot)
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return bad_hva();
return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
{
struct page *page[1];
- unsigned long addr;
int npages;
pfn_t pfn;
might_sleep();
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr)) {
- get_page(bad_page);
- return page_to_pfn(bad_page);
- }
-
npages = get_user_pages_fast(addr, 1, 1, page);
if (unlikely(npages != 1)) {
@@ -1441,8 +954,32 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
return pfn;
}
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+{
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr)) {
+ get_page(bad_page);
+ return page_to_pfn(bad_page);
+ }
+
+ return hva_to_pfn(kvm, addr);
+}
EXPORT_SYMBOL_GPL(gfn_to_pfn);
+static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
+}
+
+pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ unsigned long addr = gfn_to_hva_memslot(slot, gfn);
+ return hva_to_pfn(kvm, addr);
+}
+
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
pfn_t pfn;
@@ -1651,8 +1188,8 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
unsigned long rel_gfn = gfn - memslot->base_gfn;
/* avoid RMW */
- if (!test_bit(rel_gfn, memslot->dirty_bitmap))
- set_bit(rel_gfn, memslot->dirty_bitmap);
+ if (!generic_test_le_bit(rel_gfn, memslot->dirty_bitmap))
+ generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
}
}
@@ -1675,9 +1212,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
if (signal_pending(current))
break;
- vcpu_put(vcpu);
schedule();
- vcpu_load(vcpu);
}
finish_wait(&vcpu->wq, &wait);
@@ -1691,6 +1226,21 @@ void kvm_resched(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_resched);
+void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
+{
+ ktime_t expires;
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+
+ /* Sleep for 100 us, and hope lock-holder got scheduled */
+ expires = ktime_add_ns(ktime_get(), 100000UL);
+ schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+
+ finish_wait(&vcpu->wq, &wait);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
+
static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct kvm_vcpu *vcpu = vma->vm_file->private_data;
@@ -1743,7 +1293,7 @@ static struct file_operations kvm_vcpu_fops = {
*/
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
- return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0);
+ return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
}
/*
@@ -1814,88 +1364,6 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
return 0;
}
-#ifdef __KVM_HAVE_MSIX
-static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
- struct kvm_assigned_msix_nr *entry_nr)
-{
- int r = 0;
- struct kvm_assigned_dev_kernel *adev;
-
- mutex_lock(&kvm->lock);
-
- adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
- entry_nr->assigned_dev_id);
- if (!adev) {
- r = -EINVAL;
- goto msix_nr_out;
- }
-
- if (adev->entries_nr == 0) {
- adev->entries_nr = entry_nr->entry_nr;
- if (adev->entries_nr == 0 ||
- adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
- r = -EINVAL;
- goto msix_nr_out;
- }
-
- adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
- entry_nr->entry_nr,
- GFP_KERNEL);
- if (!adev->host_msix_entries) {
- r = -ENOMEM;
- goto msix_nr_out;
- }
- adev->guest_msix_entries = kzalloc(
- sizeof(struct kvm_guest_msix_entry) *
- entry_nr->entry_nr, GFP_KERNEL);
- if (!adev->guest_msix_entries) {
- kfree(adev->host_msix_entries);
- r = -ENOMEM;
- goto msix_nr_out;
- }
- } else /* Not allowed set MSI-X number twice */
- r = -EINVAL;
-msix_nr_out:
- mutex_unlock(&kvm->lock);
- return r;
-}
-
-static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
- struct kvm_assigned_msix_entry *entry)
-{
- int r = 0, i;
- struct kvm_assigned_dev_kernel *adev;
-
- mutex_lock(&kvm->lock);
-
- adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
- entry->assigned_dev_id);
-
- if (!adev) {
- r = -EINVAL;
- goto msix_entry_out;
- }
-
- for (i = 0; i < adev->entries_nr; i++)
- if (adev->guest_msix_entries[i].vector == 0 ||
- adev->guest_msix_entries[i].entry == entry->entry) {
- adev->guest_msix_entries[i].entry = entry->entry;
- adev->guest_msix_entries[i].vector = entry->gsi;
- adev->host_msix_entries[i].entry = entry->entry;
- break;
- }
- if (i == adev->entries_nr) {
- r = -ENOSPC;
- goto msix_entry_out;
- }
-
-msix_entry_out:
- mutex_unlock(&kvm->lock);
-
- return r;
-}
-#endif
-
static long kvm_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -2154,112 +1622,6 @@ static long kvm_vm_ioctl(struct file *filp,
break;
}
#endif
-#ifdef KVM_CAP_DEVICE_ASSIGNMENT
- case KVM_ASSIGN_PCI_DEVICE: {
- struct kvm_assigned_pci_dev assigned_dev;
-
- r = -EFAULT;
- if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
- goto out;
- r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
- if (r)
- goto out;
- break;
- }
- case KVM_ASSIGN_IRQ: {
- r = -EOPNOTSUPP;
- break;
- }
-#ifdef KVM_CAP_ASSIGN_DEV_IRQ
- case KVM_ASSIGN_DEV_IRQ: {
- struct kvm_assigned_irq assigned_irq;
-
- r = -EFAULT;
- if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
- goto out;
- r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
- if (r)
- goto out;
- break;
- }
- case KVM_DEASSIGN_DEV_IRQ: {
- struct kvm_assigned_irq assigned_irq;
-
- r = -EFAULT;
- if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
- goto out;
- r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
- if (r)
- goto out;
- break;
- }
-#endif
-#endif
-#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
- case KVM_DEASSIGN_PCI_DEVICE: {
- struct kvm_assigned_pci_dev assigned_dev;
-
- r = -EFAULT;
- if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
- goto out;
- r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
- if (r)
- goto out;
- break;
- }
-#endif
-#ifdef KVM_CAP_IRQ_ROUTING
- case KVM_SET_GSI_ROUTING: {
- struct kvm_irq_routing routing;
- struct kvm_irq_routing __user *urouting;
- struct kvm_irq_routing_entry *entries;
-
- r = -EFAULT;
- if (copy_from_user(&routing, argp, sizeof(routing)))
- goto out;
- r = -EINVAL;
- if (routing.nr >= KVM_MAX_IRQ_ROUTES)
- goto out;
- if (routing.flags)
- goto out;
- r = -ENOMEM;
- entries = vmalloc(routing.nr * sizeof(*entries));
- if (!entries)
- goto out;
- r = -EFAULT;
- urouting = argp;
- if (copy_from_user(entries, urouting->entries,
- routing.nr * sizeof(*entries)))
- goto out_free_irq_routing;
- r = kvm_set_irq_routing(kvm, entries, routing.nr,
- routing.flags);
- out_free_irq_routing:
- vfree(entries);
- break;
- }
-#endif /* KVM_CAP_IRQ_ROUTING */
-#ifdef __KVM_HAVE_MSIX
- case KVM_ASSIGN_SET_MSIX_NR: {
- struct kvm_assigned_msix_nr entry_nr;
- r = -EFAULT;
- if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
- goto out;
- r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
- if (r)
- goto out;
- break;
- }
- case KVM_ASSIGN_SET_MSIX_ENTRY: {
- struct kvm_assigned_msix_entry entry;
- r = -EFAULT;
- if (copy_from_user(&entry, argp, sizeof entry))
- goto out;
- r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
- if (r)
- goto out;
- break;
- }
-#endif
case KVM_IRQFD: {
struct kvm_irqfd data;
@@ -2291,11 +1653,59 @@ static long kvm_vm_ioctl(struct file *filp,
#endif
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
+ if (r == -ENOTTY)
+ r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
}
out:
return r;
}
+#ifdef CONFIG_COMPAT
+struct compat_kvm_dirty_log {
+ __u32 slot;
+ __u32 padding1;
+ union {
+ compat_uptr_t dirty_bitmap; /* one bit per page */
+ __u64 padding2;
+ };
+};
+
+static long kvm_vm_compat_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm *kvm = filp->private_data;
+ int r;
+
+ if (kvm->mm != current->mm)
+ return -EIO;
+ switch (ioctl) {
+ case KVM_GET_DIRTY_LOG: {
+ struct compat_kvm_dirty_log compat_log;
+ struct kvm_dirty_log log;
+
+ r = -EFAULT;
+ if (copy_from_user(&compat_log, (void __user *)arg,
+ sizeof(compat_log)))
+ goto out;
+ log.slot = compat_log.slot;
+ log.padding1 = compat_log.padding1;
+ log.padding2 = compat_log.padding2;
+ log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
+
+ r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
+ if (r)
+ goto out;
+ break;
+ }
+ default:
+ r = kvm_vm_ioctl(filp, ioctl, arg);
+ }
+
+out:
+ return r;
+}
+#endif
+
static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page[1];
@@ -2330,7 +1740,9 @@ static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
static struct file_operations kvm_vm_fops = {
.release = kvm_vm_release,
.unlocked_ioctl = kvm_vm_ioctl,
- .compat_ioctl = kvm_vm_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = kvm_vm_compat_ioctl,
+#endif
.mmap = kvm_vm_mmap,
};
@@ -2342,7 +1754,7 @@ static int kvm_dev_ioctl_create_vm(void)
kvm = kvm_create_vm();
if (IS_ERR(kvm))
return PTR_ERR(kvm);
- fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0);
+ fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
if (fd < 0)
kvm_put_kvm(kvm);
@@ -2358,6 +1770,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
case KVM_CAP_SET_BOOT_CPU_ID:
#endif
+ case KVM_CAP_INTERNAL_ERROR_DATA:
return 1;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
case KVM_CAP_IRQ_ROUTING:
@@ -2428,11 +1841,21 @@ static struct miscdevice kvm_dev = {
static void hardware_enable(void *junk)
{
int cpu = raw_smp_processor_id();
+ int r;
if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
return;
+
cpumask_set_cpu(cpu, cpus_hardware_enabled);
- kvm_arch_hardware_enable(NULL);
+
+ r = kvm_arch_hardware_enable(NULL);
+
+ if (r) {
+ cpumask_clear_cpu(cpu, cpus_hardware_enabled);
+ atomic_inc(&hardware_enable_failed);
+ printk(KERN_INFO "kvm: enabling virtualization on "
+ "CPU%d failed\n", cpu);
+ }
}
static void hardware_disable(void *junk)
@@ -2445,11 +1868,52 @@ static void hardware_disable(void *junk)
kvm_arch_hardware_disable(NULL);
}
+static void hardware_disable_all_nolock(void)
+{
+ BUG_ON(!kvm_usage_count);
+
+ kvm_usage_count--;
+ if (!kvm_usage_count)
+ on_each_cpu(hardware_disable, NULL, 1);
+}
+
+static void hardware_disable_all(void)
+{
+ spin_lock(&kvm_lock);
+ hardware_disable_all_nolock();
+ spin_unlock(&kvm_lock);
+}
+
+static int hardware_enable_all(void)
+{
+ int r = 0;
+
+ spin_lock(&kvm_lock);
+
+ kvm_usage_count++;
+ if (kvm_usage_count == 1) {
+ atomic_set(&hardware_enable_failed, 0);
+ on_each_cpu(hardware_enable, NULL, 1);
+
+ if (atomic_read(&hardware_enable_failed)) {
+ hardware_disable_all_nolock();
+ r = -EBUSY;
+ }
+ }
+
+ spin_unlock(&kvm_lock);
+
+ return r;
+}
+
static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
void *v)
{
int cpu = (long)v;
+ if (!kvm_usage_count)
+ return NOTIFY_OK;
+
val &= ~CPU_TASKS_FROZEN;
switch (val) {
case CPU_DYING:
@@ -2503,12 +1967,7 @@ static struct notifier_block kvm_reboot_notifier = {
.priority = 0,
};
-void kvm_io_bus_init(struct kvm_io_bus *bus)
-{
- memset(bus, 0, sizeof(*bus));
-}
-
-void kvm_io_bus_destroy(struct kvm_io_bus *bus)
+static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
int i;
@@ -2517,13 +1976,15 @@ void kvm_io_bus_destroy(struct kvm_io_bus *bus)
kvm_iodevice_destructor(pos);
}
+ kfree(bus);
}
/* kvm_io_bus_write - called under kvm->slots_lock */
-int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr,
+int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, const void *val)
{
int i;
+ struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
for (i = 0; i < bus->dev_count; i++)
if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
return 0;
@@ -2531,59 +1992,71 @@ int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr,
}
/* kvm_io_bus_read - called under kvm->slots_lock */
-int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val)
+int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+ int len, void *val)
{
int i;
+ struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
+
for (i = 0; i < bus->dev_count; i++)
if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
return 0;
return -EOPNOTSUPP;
}
-int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
- struct kvm_io_device *dev)
+/* Caller must hold slots_lock. */
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev)
{
- int ret;
+ struct kvm_io_bus *new_bus, *bus;
- down_write(&kvm->slots_lock);
- ret = __kvm_io_bus_register_dev(bus, dev);
- up_write(&kvm->slots_lock);
-
- return ret;
-}
-
-/* An unlocked version. Caller must have write lock on slots_lock. */
-int __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
- struct kvm_io_device *dev)
-{
+ bus = kvm->buses[bus_idx];
if (bus->dev_count > NR_IOBUS_DEVS-1)
return -ENOSPC;
- bus->devs[bus->dev_count++] = dev;
+ new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
+ if (!new_bus)
+ return -ENOMEM;
+ memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
+ new_bus->devs[new_bus->dev_count++] = dev;
+ rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+ synchronize_srcu_expedited(&kvm->srcu);
+ kfree(bus);
return 0;
}
-void kvm_io_bus_unregister_dev(struct kvm *kvm,
- struct kvm_io_bus *bus,
- struct kvm_io_device *dev)
+/* Caller must hold slots_lock. */
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev)
{
- down_write(&kvm->slots_lock);
- __kvm_io_bus_unregister_dev(bus, dev);
- up_write(&kvm->slots_lock);
-}
+ int i, r;
+ struct kvm_io_bus *new_bus, *bus;
-/* An unlocked version. Caller must have write lock on slots_lock. */
-void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus,
- struct kvm_io_device *dev)
-{
- int i;
+ new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
+ if (!new_bus)
+ return -ENOMEM;
- for (i = 0; i < bus->dev_count; i++)
- if (bus->devs[i] == dev) {
- bus->devs[i] = bus->devs[--bus->dev_count];
+ bus = kvm->buses[bus_idx];
+ memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
+
+ r = -ENOENT;
+ for (i = 0; i < new_bus->dev_count; i++)
+ if (new_bus->devs[i] == dev) {
+ r = 0;
+ new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
break;
}
+
+ if (r) {
+ kfree(new_bus);
+ return r;
+ }
+
+ rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+ synchronize_srcu_expedited(&kvm->srcu);
+ kfree(bus);
+ return r;
}
static struct notifier_block kvm_cpu_notifier = {
@@ -2625,7 +2098,7 @@ static int vcpu_stat_get(void *_offset, u64 *val)
DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
-static struct file_operations *stat_fops[] = {
+static const struct file_operations *stat_fops[] = {
[KVM_STAT_VCPU] = &vcpu_stat_fops,
[KVM_STAT_VM] = &vm_stat_fops,
};
@@ -2652,13 +2125,15 @@ static void kvm_exit_debug(void)
static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
- hardware_disable(NULL);
+ if (kvm_usage_count)
+ hardware_disable(NULL);
return 0;
}
static int kvm_resume(struct sys_device *dev)
{
- hardware_enable(NULL);
+ if (kvm_usage_count)
+ hardware_enable(NULL);
return 0;
}
@@ -2703,8 +2178,6 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
int r;
int cpu;
- kvm_init_debug();
-
r = kvm_arch_init(opaque);
if (r)
goto out_fail;
@@ -2735,7 +2208,6 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
goto out_free_1;
}
- on_each_cpu(hardware_enable, NULL, 1);
r = register_cpu_notifier(&kvm_cpu_notifier);
if (r)
goto out_free_2;
@@ -2771,6 +2243,8 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
kvm_preempt_ops.sched_in = kvm_sched_in;
kvm_preempt_ops.sched_out = kvm_sched_out;
+ kvm_init_debug();
+
return 0;
out_free:
@@ -2783,7 +2257,6 @@ out_free_3:
unregister_reboot_notifier(&kvm_reboot_notifier);
unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
- on_each_cpu(hardware_disable, NULL, 1);
out_free_1:
kvm_arch_hardware_unsetup();
out_free_0a:
@@ -2793,7 +2266,6 @@ out_free_0:
out:
kvm_arch_exit();
out_fail:
- kvm_exit_debug();
return r;
}
EXPORT_SYMBOL_GPL(kvm_init);
@@ -2801,6 +2273,7 @@ EXPORT_SYMBOL_GPL(kvm_init);
void kvm_exit(void)
{
tracepoint_synchronize_unregister();
+ kvm_exit_debug();
misc_deregister(&kvm_dev);
kmem_cache_destroy(kvm_vcpu_cache);
sysdev_unregister(&kvm_sysdev);
@@ -2810,7 +2283,6 @@ void kvm_exit(void)
on_each_cpu(hardware_disable, NULL, 1);
kvm_arch_hardware_unsetup();
kvm_arch_exit();
- kvm_exit_debug();
free_cpumask_var(cpus_hardware_enabled);
__free_page(bad_page);
}