From db0d4db22a78d31c59087f7057b8f1612fecc35d Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Sat, 12 Nov 2011 16:09:49 +0000
Subject: ARM: gic: allow GIC to support non-banked setups

The GIC support code is heavily using the fact that hardware
implementations are exposing banked registers. Unfortunately, it
looks like at least one GIC implementation (EXYNOS) offers both
the distributor and the CPU interfaces at different addresses,
depending on the CPU.

This problem is solved by allowing the distributor and CPU interface
addresses to be per-cpu variables for the platforms that require it.
The EXYNOS code is updated not to mess with the GIC internals while
handling interrupts, and struct gic_chip_data is back to being private.
The DT binding for the gic is updated to allow an optional "cpu-offset"
value, which is used to compute the various base addresses.

Finally, a new config option (GIC_NON_BANKED) is used to control this
feature, so the overhead is only present on kernels compiled with
support for EXYNOS.

Tested on Origen (EXYNOS4) and Panda (OMAP4).

Cc: Kukjin Kim <kgene.kim@samsung.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Thomas Abraham <thomas.abraham@linaro.org>
Acked-by: Rob Herring <rob.herring@calxeda.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 Documentation/devicetree/bindings/arm/gic.txt | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/devicetree/bindings/arm/gic.txt b/Documentation/devicetree/bindings/arm/gic.txt
index 52916b4aa1fe..9b4b82a721b6 100644
--- a/Documentation/devicetree/bindings/arm/gic.txt
+++ b/Documentation/devicetree/bindings/arm/gic.txt
@@ -42,6 +42,10 @@ Optional
 - interrupts	: Interrupt source of the parent interrupt controller. Only
   present on secondary GICs.
 
+- cpu-offset	: per-cpu offset within the distributor and cpu interface
+  regions, used when the GIC doesn't have banked registers. The offset is
+  cpu-offset * cpu-nr.
+
 Example:
 
 	intc: interrupt-controller@fff11000 {
-- 
cgit v1.2.3-55-g7522


From f9b28ccbc7139af656147dcbba9c5425d5706b7d Mon Sep 17 00:00:00 2001
From: Jamie Iles
Date: Tue, 27 Sep 2011 11:00:46 +0100
Subject: ARM: vic: device tree binding

This adds a device tree binding for the VIC based on the of_irq_init()
support.  This adds an irqdomain to the vic and always registers all
vics in the static vic array rather than for pm only to keep track of
the irq domain.  struct irq_data::hwirq is used where appropriate rather
than runtime masking.

v3:	- include linux/export.h for THIS_MODULE
v2:	- use irq_domain_simple_ops
	- remove stub implementation of vic_of_init for !CONFIG_OF
	- Make VIC select IRQ_DOMAIN

Reviewed-by: Rob Herring <robherring2@gmail.com>
Reviewed-by: Grant Likely <grant.likely@secretlab.ca>
Tested-by: Thomas Abraham <thomas.abraham@linaro.org>
Signed-off-by: Jamie Iles <jamie@jamieiles.com>
---
 Documentation/devicetree/bindings/arm/vic.txt |  29 +++++++
 arch/arm/common/Kconfig                       |   1 +
 arch/arm/common/vic.c                         | 107 +++++++++++++++++++-------
 arch/arm/include/asm/hardware/vic.h           |   7 +-
 4 files changed, 115 insertions(+), 29 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/arm/vic.txt

(limited to 'Documentation')

diff --git a/Documentation/devicetree/bindings/arm/vic.txt b/Documentation/devicetree/bindings/arm/vic.txt
new file mode 100644
index 000000000000..266716b23437
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/vic.txt
@@ -0,0 +1,29 @@
+* ARM Vectored Interrupt Controller
+
+One or more Vectored Interrupt Controllers (VIC's) can be connected in an ARM
+system for interrupt routing.  For multiple controllers they can either be
+nested or have the outputs wire-OR'd together.
+
+Required properties:
+
+- compatible : should be one of
+	"arm,pl190-vic"
+	"arm,pl192-vic"
+- interrupt-controller : Identifies the node as an interrupt controller
+- #interrupt-cells : The number of cells to define the interrupts.  Must be 1 as
+  the VIC has no configuration options for interrupt sources.  The cell is a u32
+  and defines the interrupt number.
+- reg : The register bank for the VIC.
+
+Optional properties:
+
+- interrupts : Interrupt source for parent controllers if the VIC is nested.
+
+Example:
+
+	vic0: interrupt-controller@60000 {
+		compatible = "arm,pl192-vic";
+		interrupt-controller;
+		#interrupt-cells = <1>;
+		reg = <0x60000 0x1000>;
+	};
diff --git a/arch/arm/common/Kconfig b/arch/arm/common/Kconfig
index a11cee523cd4..10ca9749e94d 100644
--- a/arch/arm/common/Kconfig
+++ b/arch/arm/common/Kconfig
@@ -7,6 +7,7 @@ config GIC_NON_BANKED
 	bool
 
 config ARM_VIC
+	select IRQ_DOMAIN
 	bool
 
 config ARM_VIC_NR
diff --git a/arch/arm/common/vic.c b/arch/arm/common/vic.c
index 01f18a421b17..a227a7d53700 100644
--- a/arch/arm/common/vic.c
+++ b/arch/arm/common/vic.c
@@ -19,9 +19,14 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <linux/export.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/io.h>
+#include <linux/irqdomain.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
 #include <linux/syscore_ops.h>
 #include <linux/device.h>
 #include <linux/amba/bus.h>
@@ -29,7 +34,6 @@
 #include <asm/mach/irq.h>
 #include <asm/hardware/vic.h>
 
-#ifdef CONFIG_PM
 /**
  * struct vic_device - VIC PM device
  * @irq: The IRQ number for the base of the VIC.
@@ -40,6 +44,7 @@
  * @int_enable: Save for VIC_INT_ENABLE.
  * @soft_int: Save for VIC_INT_SOFT.
  * @protect: Save for VIC_PROTECT.
+ * @domain: The IRQ domain for the VIC.
  */
 struct vic_device {
 	void __iomem	*base;
@@ -50,13 +55,13 @@ struct vic_device {
 	u32		int_enable;
 	u32		soft_int;
 	u32		protect;
+	struct irq_domain domain;
 };
 
 /* we cannot allocate memory when VICs are initially registered */
 static struct vic_device vic_devices[CONFIG_ARM_VIC_NR];
 
 static int vic_id;
-#endif /* CONFIG_PM */
 
 /**
  * vic_init2 - common initialisation code
@@ -156,39 +161,50 @@ static int __init vic_pm_init(void)
 	return 0;
 }
 late_initcall(vic_pm_init);
+#endif /* CONFIG_PM */
 
 /**
- * vic_pm_register - Register a VIC for later power management control
+ * vic_register() - Register a VIC.
  * @base: The base address of the VIC.
  * @irq: The base IRQ for the VIC.
  * @resume_sources: bitmask of interrupts allowed for resume sources.
+ * @node: The device tree node associated with the VIC.
  *
  * Register the VIC with the system device tree so that it can be notified
  * of suspend and resume requests and ensure that the correct actions are
  * taken to re-instate the settings on resume.
+ *
+ * This also configures the IRQ domain for the VIC.
  */
-static void __init vic_pm_register(void __iomem *base, unsigned int irq, u32 resume_sources)
+static void __init vic_register(void __iomem *base, unsigned int irq,
+				u32 resume_sources, struct device_node *node)
 {
 	struct vic_device *v;
 
-	if (vic_id >= ARRAY_SIZE(vic_devices))
+	if (vic_id >= ARRAY_SIZE(vic_devices)) {
 		printk(KERN_ERR "%s: too few VICs, increase CONFIG_ARM_VIC_NR\n", __func__);
-	else {
-		v = &vic_devices[vic_id];
-		v->base = base;
-		v->resume_sources = resume_sources;
-		v->irq = irq;
-		vic_id++;
+		return;
 	}
+
+	v = &vic_devices[vic_id];
+	v->base = base;
+	v->resume_sources = resume_sources;
+	v->irq = irq;
+	vic_id++;
+
+	v->domain.irq_base = irq;
+	v->domain.nr_irq = 32;
+#ifdef CONFIG_OF_IRQ
+	v->domain.of_node = of_node_get(node);
+	v->domain.ops = &irq_domain_simple_ops;
+#endif /* CONFIG_OF */
+	irq_domain_add(&v->domain);
 }
-#else
-static inline void vic_pm_register(void __iomem *base, unsigned int irq, u32 arg1) { }
-#endif /* CONFIG_PM */
 
 static void vic_ack_irq(struct irq_data *d)
 {
 	void __iomem *base = irq_data_get_irq_chip_data(d);
-	unsigned int irq = d->irq & 31;
+	unsigned int irq = d->hwirq;
 	writel(1 << irq, base + VIC_INT_ENABLE_CLEAR);
 	/* moreover, clear the soft-triggered, in case it was the reason */
 	writel(1 << irq, base + VIC_INT_SOFT_CLEAR);
@@ -197,14 +213,14 @@ static void vic_ack_irq(struct irq_data *d)
 static void vic_mask_irq(struct irq_data *d)
 {
 	void __iomem *base = irq_data_get_irq_chip_data(d);
-	unsigned int irq = d->irq & 31;
+	unsigned int irq = d->hwirq;
 	writel(1 << irq, base + VIC_INT_ENABLE_CLEAR);
 }
 
 static void vic_unmask_irq(struct irq_data *d)
 {
 	void __iomem *base = irq_data_get_irq_chip_data(d);
-	unsigned int irq = d->irq & 31;
+	unsigned int irq = d->hwirq;
 	writel(1 << irq, base + VIC_INT_ENABLE);
 }
 
@@ -226,7 +242,7 @@ static struct vic_device *vic_from_irq(unsigned int irq)
 static int vic_set_wake(struct irq_data *d, unsigned int on)
 {
 	struct vic_device *v = vic_from_irq(d->irq);
-	unsigned int off = d->irq & 31;
+	unsigned int off = d->hwirq;
 	u32 bit = 1 << off;
 
 	if (!v)
@@ -330,15 +346,9 @@ static void __init vic_init_st(void __iomem *base, unsigned int irq_start,
 	vic_set_irq_sources(base, irq_start, vic_sources);
 }
 
-/**
- * vic_init - initialise a vectored interrupt controller
- * @base: iomem base address
- * @irq_start: starting interrupt number, must be muliple of 32
- * @vic_sources: bitmask of interrupt sources to allow
- * @resume_sources: bitmask of interrupt sources to allow for resume
- */
-void __init vic_init(void __iomem *base, unsigned int irq_start,
-		     u32 vic_sources, u32 resume_sources)
+static void __init __vic_init(void __iomem *base, unsigned int irq_start,
+			      u32 vic_sources, u32 resume_sources,
+			      struct device_node *node)
 {
 	unsigned int i;
 	u32 cellid = 0;
@@ -375,5 +385,46 @@ void __init vic_init(void __iomem *base, unsigned int irq_start,
 
 	vic_set_irq_sources(base, irq_start, vic_sources);
 
-	vic_pm_register(base, irq_start, resume_sources);
+	vic_register(base, irq_start, resume_sources, node);
+}
+
+/**
+ * vic_init() - initialise a vectored interrupt controller
+ * @base: iomem base address
+ * @irq_start: starting interrupt number, must be muliple of 32
+ * @vic_sources: bitmask of interrupt sources to allow
+ * @resume_sources: bitmask of interrupt sources to allow for resume
+ */
+void __init vic_init(void __iomem *base, unsigned int irq_start,
+		     u32 vic_sources, u32 resume_sources)
+{
+	__vic_init(base, irq_start, vic_sources, resume_sources, NULL);
+}
+
+#ifdef CONFIG_OF
+int __init vic_of_init(struct device_node *node, struct device_node *parent)
+{
+	void __iomem *regs;
+	int irq_base;
+
+	if (WARN(parent, "non-root VICs are not supported"))
+		return -EINVAL;
+
+	regs = of_iomap(node, 0);
+	if (WARN_ON(!regs))
+		return -EIO;
+
+	irq_base = irq_alloc_descs(-1, 0, 32, numa_node_id());
+	if (WARN_ON(irq_base < 0))
+		goto out_unmap;
+
+	__vic_init(regs, irq_base, ~0, ~0, node);
+
+	return 0;
+
+ out_unmap:
+	iounmap(regs);
+
+	return -EIO;
 }
+#endif /* CONFIG OF */
diff --git a/arch/arm/include/asm/hardware/vic.h b/arch/arm/include/asm/hardware/vic.h
index 5d72550a8097..b348a545de23 100644
--- a/arch/arm/include/asm/hardware/vic.h
+++ b/arch/arm/include/asm/hardware/vic.h
@@ -41,7 +41,12 @@
 #define VIC_PL192_VECT_ADDR		0xF00
 
 #ifndef __ASSEMBLY__
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+struct device_node;
 void vic_init(void __iomem *base, unsigned int irq_start, u32 vic_sources, u32 resume_sources);
-#endif
+int vic_of_init(struct device_node *node, struct device_node *parent);
 
+#endif /* __ASSEMBLY__ */
 #endif
-- 
cgit v1.2.3-55-g7522


From 0536bdf33faff4d940ac094c77998cfac368cfff Mon Sep 17 00:00:00 2001
From: Nicolas Pitre
Date: Thu, 25 Aug 2011 00:35:59 -0400
Subject: ARM: move iotable mappings within the vmalloc region

In order to remove the build time variation between different SOCs with
regards to VMALLOC_END, the iotable mappings are now allocated inside
the vmalloc region.  This allows for VMALLOC_END to be identical across
all machines.

The value for VMALLOC_END is now set to 0xff000000 which is right where
the consistent DMA area starts.

To accommodate all static mappings on machines with possible highmem usage,
the default vmalloc area size is changed to 240 MB so that VMALLOC_START
is no higher than 0xf0000000 by default.

Signed-off-by: Nicolas Pitre <nicolas.pitre@linaro.org>
Tested-by: Stephen Warren <swarren@nvidia.com>
Tested-by: Kevin Hilman <khilman@ti.com>
Tested-by: Jamie Iles <jamie@jamieiles.com>
---
 Documentation/arm/memory.txt   | 11 +++++-----
 arch/arm/include/asm/pgtable.h |  8 +------
 arch/arm/mm/mmu.c              | 49 ++++++++++++++++++++++++++++++------------
 3 files changed, 41 insertions(+), 27 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/arm/memory.txt b/Documentation/arm/memory.txt
index 771d48d3b335..208a2d465b92 100644
--- a/Documentation/arm/memory.txt
+++ b/Documentation/arm/memory.txt
@@ -51,15 +51,14 @@ ffc00000	ffefffff	DMA memory mapping region.  Memory returned
 ff000000	ffbfffff	Reserved for future expansion of DMA
 				mapping region.
 
-VMALLOC_END	feffffff	Free for platform use, recommended.
-				VMALLOC_END must be aligned to a 2MB
-				boundary.
-
 VMALLOC_START	VMALLOC_END-1	vmalloc() / ioremap() space.
 				Memory returned by vmalloc/ioremap will
 				be dynamically placed in this region.
-				VMALLOC_START may be based upon the value
-				of the high_memory variable.
+				Machine specific static mappings are also
+				located here through iotable_init().
+				VMALLOC_START is based upon the value
+				of the high_memory variable, and VMALLOC_END
+				is equal to 0xff000000.
 
 PAGE_OFFSET	high_memory-1	Kernel direct-mapped RAM region.
 				This maps the platforms RAM, and typically
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 9451dce3a553..6cdd55cb0b8c 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -21,7 +21,6 @@
 #else
 
 #include <asm/memory.h>
-#include <mach/vmalloc.h>
 #include <asm/pgtable-hwdef.h>
 
 #include <asm/pgtable-2level.h>
@@ -33,15 +32,10 @@
  * any out-of-bounds memory accesses will hopefully be caught.
  * The vmalloc() routines leaves a hole of 4kB between each vmalloced
  * area for the same reason. ;)
- *
- * Note that platforms may override VMALLOC_START, but they must provide
- * VMALLOC_END.  VMALLOC_END defines the (exclusive) limit of this space,
- * which may not overlap IO space.
  */
-#ifndef VMALLOC_START
 #define VMALLOC_OFFSET		(8*1024*1024)
 #define VMALLOC_START		(((unsigned long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))
-#endif
+#define VMALLOC_END		0xff000000UL
 
 #define LIBRARY_TEXT_START	0x0c000000
 
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 0aa8b7d5b21d..c61481577ae1 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -15,6 +15,7 @@
 #include <linux/nodemask.h>
 #include <linux/memblock.h>
 #include <linux/fs.h>
+#include <linux/vmalloc.h>
 
 #include <asm/cputype.h>
 #include <asm/sections.h>
@@ -529,13 +530,18 @@ EXPORT_SYMBOL(phys_mem_access_prot);
 
 #define vectors_base()	(vectors_high() ? 0xffff0000 : 0)
 
-static void __init *early_alloc(unsigned long sz)
+static void __init *early_alloc_aligned(unsigned long sz, unsigned long align)
 {
-	void *ptr = __va(memblock_alloc(sz, sz));
+	void *ptr = __va(memblock_alloc(sz, align));
 	memset(ptr, 0, sz);
 	return ptr;
 }
 
+static void __init *early_alloc(unsigned long sz)
+{
+	return early_alloc_aligned(sz, sz);
+}
+
 static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr, unsigned long prot)
 {
 	if (pmd_none(*pmd)) {
@@ -685,9 +691,10 @@ static void __init create_mapping(struct map_desc *md)
 	}
 
 	if ((md->type == MT_DEVICE || md->type == MT_ROM) &&
-	    md->virtual >= PAGE_OFFSET && md->virtual < VMALLOC_END) {
+	    md->virtual >= PAGE_OFFSET &&
+	    (md->virtual < VMALLOC_START || md->virtual >= VMALLOC_END)) {
 		printk(KERN_WARNING "BUG: mapping for 0x%08llx"
-		       " at 0x%08lx overlaps vmalloc space\n",
+		       " at 0x%08lx out of vmalloc space\n",
 		       (long long)__pfn_to_phys((u64)md->pfn), md->virtual);
 	}
 
@@ -729,18 +736,32 @@ static void __init create_mapping(struct map_desc *md)
  */
 void __init iotable_init(struct map_desc *io_desc, int nr)
 {
-	int i;
+	struct map_desc *md;
+	struct vm_struct *vm;
+
+	if (!nr)
+		return;
 
-	for (i = 0; i < nr; i++)
-		create_mapping(io_desc + i);
+	vm = early_alloc_aligned(sizeof(*vm) * nr, __alignof__(*vm));
+
+	for (md = io_desc; nr; md++, nr--) {
+		create_mapping(md);
+		vm->addr = (void *)(md->virtual & PAGE_MASK);
+		vm->size = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));
+		vm->phys_addr = __pfn_to_phys(md->pfn); 
+		vm->flags = VM_IOREMAP;
+		vm->caller = iotable_init;
+		vm_area_add_early(vm++);
+	}
 }
 
-static void * __initdata vmalloc_min = (void *)(VMALLOC_END - SZ_128M);
+static void * __initdata vmalloc_min =
+	(void *)(VMALLOC_END - (240 << 20) - VMALLOC_OFFSET);
 
 /*
  * vmalloc=size forces the vmalloc area to be exactly 'size'
  * bytes. This can be used to increase (or decrease) the vmalloc
- * area - the default is 128m.
+ * area - the default is 240m.
  */
 static int __init early_vmalloc(char *arg)
 {
@@ -891,10 +912,10 @@ static inline void prepare_page_table(void)
 
 	/*
 	 * Clear out all the kernel space mappings, except for the first
-	 * memory bank, up to the end of the vmalloc region.
+	 * memory bank, up to the vmalloc region.
 	 */
 	for (addr = __phys_to_virt(end);
-	     addr < VMALLOC_END; addr += PMD_SIZE)
+	     addr < VMALLOC_START; addr += PMD_SIZE)
 		pmd_clear(pmd_off_k(addr));
 }
 
@@ -921,8 +942,8 @@ void __init arm_mm_memblock_reserve(void)
 }
 
 /*
- * Set up device the mappings.  Since we clear out the page tables for all
- * mappings above VMALLOC_END, we will remove any debug device mappings.
+ * Set up the device mappings.  Since we clear out the page tables for all
+ * mappings above VMALLOC_START, we will remove any debug device mappings.
  * This means you have to be careful how you debug this function, or any
  * called function.  This means you can't use any function or debugging
  * method which may touch any device, otherwise the kernel _will_ crash.
@@ -937,7 +958,7 @@ static void __init devicemaps_init(struct machine_desc *mdesc)
 	 */
 	vectors_page = early_alloc(PAGE_SIZE);
 
-	for (addr = VMALLOC_END; addr; addr += PMD_SIZE)
+	for (addr = VMALLOC_START; addr; addr += PMD_SIZE)
 		pmd_clear(pmd_off_k(addr));
 
 	/*
-- 
cgit v1.2.3-55-g7522


From 423873736b78f549fbfa2f715f2e4de7e6c5e1e9 Mon Sep 17 00:00:00 2001
From: Alex Williamson
Date: Tue, 20 Dec 2011 21:59:03 -0700
Subject: KVM: Remove ability to assign a device without iommu support

This option has no users and it exposes a security hole that we
can allow devices to be assigned without iommu protection.  Make
KVM_DEV_ASSIGN_ENABLE_IOMMU a mandatory option.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  3 +++
 virt/kvm/assigned-dev.c           | 18 +++++++++---------
 2 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 7945b0bd35e2..ee2c96b3ba5a 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1151,6 +1151,9 @@ following flags are specified:
 /* Depends on KVM_CAP_IOMMU */
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
+The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
+isolation of the device.  Usages not specifying this flag are deprecated.
+
 4.49 KVM_DEASSIGN_PCI_DEVICE
 
 Capability: KVM_CAP_DEVICE_DEASSIGNMENT
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 3ad0925d23a9..a251a28f79c7 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -487,6 +487,9 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	struct kvm_assigned_dev_kernel *match;
 	struct pci_dev *dev;
 
+	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
+		return -EINVAL;
+
 	mutex_lock(&kvm->lock);
 	idx = srcu_read_lock(&kvm->srcu);
 
@@ -544,16 +547,14 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 
 	list_add(&match->list, &kvm->arch.assigned_dev_head);
 
-	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
-		if (!kvm->arch.iommu_domain) {
-			r = kvm_iommu_map_guest(kvm);
-			if (r)
-				goto out_list_del;
-		}
-		r = kvm_assign_device(kvm, match);
+	if (!kvm->arch.iommu_domain) {
+		r = kvm_iommu_map_guest(kvm);
 		if (r)
 			goto out_list_del;
 	}
+	r = kvm_assign_device(kvm, match);
+	if (r)
+		goto out_list_del;
 
 out:
 	srcu_read_unlock(&kvm->srcu, idx);
@@ -593,8 +594,7 @@ static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
 		goto out;
 	}
 
-	if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
-		kvm_deassign_device(kvm, match);
+	kvm_deassign_device(kvm, match);
 
 	kvm_free_assigned_device(kvm, match);
 
-- 
cgit v1.2.3-55-g7522


From 3d27e23b17010c668db311140b17bbbb70c78fb9 Mon Sep 17 00:00:00 2001
From: Alex Williamson
Date: Tue, 20 Dec 2011 21:59:09 -0700
Subject: KVM: Device assignment permission checks

Only allow KVM device assignment to attach to devices which:

 - Are not bridges
 - Have BAR resources (assume others are special devices)
 - The user has permissions to use

Assigning a bridge is a configuration error, it's not supported, and
typically doesn't result in the behavior the user is expecting anyway.
Devices without BAR resources are typically chipset components that
also don't have host drivers.  We don't want users to hold such devices
captive or cause system problems by fencing them off into an iommu
domain.  We determine "permission to use" by testing whether the user
has access to the PCI sysfs resource files.  By default a normal user
will not have access to these files, so it provides a good indication
that an administration agent has granted the user access to the device.

[Yang Bai: add missing #include]
[avi: fix comment style]

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Yang Bai <hamo.by@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  4 +++
 virt/kvm/assigned-dev.c           | 75 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index ee2c96b3ba5a..4df9af4f6132 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1154,6 +1154,10 @@ following flags are specified:
 The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
 isolation of the device.  Usages not specifying this flag are deprecated.
 
+Only PCI header type 0 devices with PCI BAR resources are supported by
+device assignment.  The user requesting this ioctl must have read/write
+access to the PCI sysfs resource files associated with the device.
+
 4.49 KVM_DEASSIGN_PCI_DEVICE
 
 Capability: KVM_CAP_DEVICE_DEASSIGNMENT
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index a251a28f79c7..758e3b36d4cf 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -17,6 +17,8 @@
 #include <linux/pci.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
 #include "irq.h"
 
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
@@ -480,12 +482,73 @@ out:
 	return r;
 }
 
+/*
+ * We want to test whether the caller has been granted permissions to
+ * use this device.  To be able to configure and control the device,
+ * the user needs access to PCI configuration space and BAR resources.
+ * These are accessed through PCI sysfs.  PCI config space is often
+ * passed to the process calling this ioctl via file descriptor, so we
+ * can't rely on access to that file.  We can check for permissions
+ * on each of the BAR resource files, which is a pretty clear
+ * indicator that the user has been granted access to the device.
+ */
+static int probe_sysfs_permissions(struct pci_dev *dev)
+{
+#ifdef CONFIG_SYSFS
+	int i;
+	bool bar_found = false;
+
+	for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
+		char *kpath, *syspath;
+		struct path path;
+		struct inode *inode;
+		int r;
+
+		if (!pci_resource_len(dev, i))
+			continue;
+
+		kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
+		if (!kpath)
+			return -ENOMEM;
+
+		/* Per sysfs-rules, sysfs is always at /sys */
+		syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
+		kfree(kpath);
+		if (!syspath)
+			return -ENOMEM;
+
+		r = kern_path(syspath, LOOKUP_FOLLOW, &path);
+		kfree(syspath);
+		if (r)
+			return r;
+
+		inode = path.dentry->d_inode;
+
+		r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
+		path_put(&path);
+		if (r)
+			return r;
+
+		bar_found = true;
+	}
+
+	/* If no resources, probably something special */
+	if (!bar_found)
+		return -EPERM;
+
+	return 0;
+#else
+	return -EINVAL; /* No way to control the device without sysfs */
+#endif
+}
+
 static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 				      struct kvm_assigned_pci_dev *assigned_dev)
 {
 	int r = 0, idx;
 	struct kvm_assigned_dev_kernel *match;
 	struct pci_dev *dev;
+	u8 header_type;
 
 	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
 		return -EINVAL;
@@ -516,6 +579,18 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 		r = -EINVAL;
 		goto out_free;
 	}
+
+	/* Don't allow bridges to be assigned */
+	pci_read_config_byte(dev, PCI_HEADER_TYPE, &header_type);
+	if ((header_type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL) {
+		r = -EPERM;
+		goto out_put;
+	}
+
+	r = probe_sysfs_permissions(dev);
+	if (r)
+		goto out_put;
+
 	if (pci_enable_device(dev)) {
 		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
 		r = -EBUSY;
-- 
cgit v1.2.3-55-g7522


From 4d25a066b69fb749a39d0d4c610689dd765a0b0e Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Wed, 21 Dec 2011 12:28:29 +0100
Subject: KVM: Don't automatically expose the TSC deadline timer in cpuid

Unlike all of the other cpuid bits, the TSC deadline timer bit is set
unconditionally, regardless of what userspace wants.

This is broken in several ways:
 - if userspace doesn't use KVM_CREATE_IRQCHIP, and doesn't emulate the TSC
   deadline timer feature, a guest that uses the feature will break
 - live migration to older host kernels that don't support the TSC deadline
   timer will cause the feature to be pulled from under the guest's feet;
   breaking it
 - guests that are broken wrt the feature will fail.

Fix by not enabling the feature automatically; instead report it to userspace.
Because the feature depends on KVM_CREATE_IRQCHIP, which we cannot guarantee
will be called, we expose it via a KVM_CAP_TSC_DEADLINE_TIMER and not
KVM_GET_SUPPORTED_CPUID.

Fixes the Illumos guest kernel, which uses the TSC deadline timer feature.

[avi: add the KVM_CAP + documentation]

Reported-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Tested-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  9 +++++++++
 arch/x86/kvm/x86.c                | 19 +++++++++----------
 include/linux/kvm.h               |  1 +
 3 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 4df9af4f6132..e2a4b5287361 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1100,6 +1100,15 @@ emulate them efficiently. The fields in each entry are defined as follows:
    eax, ebx, ecx, edx: the values returned by the cpuid instruction for
          this function/index combination
 
+The TSC deadline timer feature (CPUID leaf 1, ecx[24]) is always returned
+as false, since the feature depends on KVM_CREATE_IRQCHIP for local APIC
+support.  Instead it is reported via
+
+  ioctl(KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER)
+
+if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the
+feature in userspace, then you can enable the feature for KVM_SET_CPUID2.
+
 4.47 KVM_PPC_GET_PVINFO
 
 Capability: KVM_CAP_PPC_GET_PVINFO
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c38efd7b792e..4c938da2ba00 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -602,7 +602,6 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
 	struct kvm_lapic *apic = vcpu->arch.apic;
-	u32 timer_mode_mask;
 
 	best = kvm_find_cpuid_entry(vcpu, 1, 0);
 	if (!best)
@@ -615,15 +614,12 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
 			best->ecx |= bit(X86_FEATURE_OSXSAVE);
 	}
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
-		best->function == 0x1) {
-		best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
-		timer_mode_mask = 3 << 17;
-	} else
-		timer_mode_mask = 1 << 17;
-
-	if (apic)
-		apic->lapic_timer.timer_mode_mask = timer_mode_mask;
+	if (apic) {
+		if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
+			apic->lapic_timer.timer_mode_mask = 3 << 17;
+		else
+			apic->lapic_timer.timer_mode_mask = 1 << 17;
+	}
 }
 
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -2135,6 +2131,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_TSC_CONTROL:
 		r = kvm_has_tsc_control;
 		break;
+	case KVM_CAP_TSC_DEADLINE_TIMER:
+		r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
+		break;
 	default:
 		r = 0;
 		break;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index c3892fc1d538..68e67e50d028 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -557,6 +557,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
 #define KVM_CAP_PPC_PAPR 68
 #define KVM_CAP_S390_GMAP 71
+#define KVM_CAP_TSC_DEADLINE_TIMER 72
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3-55-g7522