From 4e0b1ab72b8af961bcaca9ec1475279c1cd9579c Mon Sep 17 00:00:00 2001
From: Fan Zhang
Date: Tue, 29 Nov 2016 07:17:55 +0100
Subject: KVM: s390: gs support for kvm guests

This patch adds guarded storage support for KVM guests. We need to
set up the necessary control blocks, the kvm_run structure for the
new registers, the necessary wrappers for VSIE, as well as the
machine check save areas.
GS is enabled lazily and the register saving and reloading is done in
KVM code.  As this feature adds new content for migration, we provide
a new capability for enablement (KVM_CAP_S390_GS).

Signed-off-by: Fan Zhang <zhangfan@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 Documentation/virtual/kvm/api.txt | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 3c248f772ae6..725250858479 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4101,6 +4101,15 @@ to take care of that.
 This capability can be enabled dynamically even if VCPUs were already
 created and are running.
 
+7.9 KVM_CAP_S390_GS
+
+Architectures: s390
+Parameters: none
+Returns: 0 on success; -EINVAL if the machine does not support
+	 guarded storage; -EBUSY if a VCPU has already been created.
+
+Allows use of guarded storage for the KVM guest.
+
 8. Other capabilities.
 ----------------------
 
-- 
cgit v1.2.3-55-g7522


From 955d8dc3ee555e9320fabbeab0969f9cf7660f9d Mon Sep 17 00:00:00 2001
From: James Hogan
Date: Tue, 14 Mar 2017 10:15:14 +0000
Subject: KVM: MIPS: Implement HYPCALL emulation

Emulate the HYPCALL instruction added in the VZ ASE and used by the MIPS
paravirtualised guest support that is already merged. The new hypcall.c
handles arguments and the return value. No actual hypercalls are yet
supported, but this still allows us to safely step over hypercalls and
set an error code in the return value for forward compatibility.

Non-zero HYPCALL codes are not handled.

We also document the hypercall ABI which asm/kvm_para.h uses.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Andreas Herrmann <andreas.herrmann@caviumnetworks.com>
Cc: David Daney <david.daney@cavium.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Cc: linux-doc@vger.kernel.org
---
 Documentation/virtual/kvm/hypercalls.txt |  5 +++
 arch/mips/include/asm/kvm_host.h         |  7 +++++
 arch/mips/include/uapi/asm/inst.h        |  2 +-
 arch/mips/kvm/Makefile                   |  1 +
 arch/mips/kvm/emulate.c                  |  3 ++
 arch/mips/kvm/hypcall.c                  | 53 ++++++++++++++++++++++++++++++++
 arch/mips/kvm/trap_emul.c                |  4 +++
 7 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100644 arch/mips/kvm/hypcall.c

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
index feaaa634f154..a890529c63ed 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -28,6 +28,11 @@ S390:
   property inside the device tree's /hypervisor node.
   For more information refer to Documentation/virtual/kvm/ppc-pv.txt
 
+MIPS:
+  KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall
+  number in $2 (v0). Up to four arguments may be placed in $4-$7 (a0-a3) and
+  the return value is placed in $2 (v0).
+
 KVM Hypercalls Documentation
 ===========================
 The template for each hypercall is:
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 05e785fc061d..0d308d4f2429 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -229,6 +229,7 @@ enum emulation_result {
 	EMULATE_WAIT,		/* WAIT instruction */
 	EMULATE_PRIV_FAIL,
 	EMULATE_EXCEPT,		/* A guest exception has been generated */
+	EMULATE_HYPERCALL,	/* HYPCALL instruction */
 };
 
 #define mips3_paddr_to_tlbpfn(x) \
@@ -832,6 +833,12 @@ unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu);
 unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu);
 unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu);
 
+/* Hypercalls (hypcall.c) */
+
+enum emulation_result kvm_mips_emul_hypcall(struct kvm_vcpu *vcpu,
+					    union mips_instruction inst);
+int kvm_mips_handle_hypcall(struct kvm_vcpu *vcpu);
+
 /* Dynamic binary translation */
 extern int kvm_mips_trans_cache_index(union mips_instruction inst,
 				      u32 *opc, struct kvm_vcpu *vcpu);
diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index 77429d1622b3..b5e46ae872d3 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -179,7 +179,7 @@ enum cop0_coi_func {
 	tlbr_op	      = 0x01, tlbwi_op	    = 0x02,
 	tlbwr_op      = 0x06, tlbp_op	    = 0x08,
 	rfe_op	      = 0x10, eret_op	    = 0x18,
-	wait_op       = 0x20,
+	wait_op       = 0x20, hypcall_op    = 0x28
 };
 
 /*
diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
index 847429de780d..e56403c8a3f5 100644
--- a/arch/mips/kvm/Makefile
+++ b/arch/mips/kvm/Makefile
@@ -10,6 +10,7 @@ common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o
 kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \
 	    interrupt.o stats.o commpage.o \
 	    dyntrans.o trap_emul.o fpu.o
+kvm-objs += hypcall.o
 kvm-objs += mmu.o
 
 obj-$(CONFIG_KVM)	+= kvm.o
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index d40cfaad4529..637753ea0a00 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1143,6 +1143,9 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
 		case wait_op:
 			er = kvm_mips_emul_wait(vcpu);
 			break;
+		case hypcall_op:
+			er = kvm_mips_emul_hypcall(vcpu, inst);
+			break;
 		}
 	} else {
 		rt = inst.c0r_format.rt;
diff --git a/arch/mips/kvm/hypcall.c b/arch/mips/kvm/hypcall.c
new file mode 100644
index 000000000000..83063435195f
--- /dev/null
+++ b/arch/mips/kvm/hypcall.c
@@ -0,0 +1,53 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * KVM/MIPS: Hypercall handling.
+ *
+ * Copyright (C) 2015  Imagination Technologies Ltd.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_para.h>
+
+#define MAX_HYPCALL_ARGS	4
+
+enum emulation_result kvm_mips_emul_hypcall(struct kvm_vcpu *vcpu,
+					    union mips_instruction inst)
+{
+	unsigned int code = (inst.co_format.code >> 5) & 0x3ff;
+
+	kvm_debug("[%#lx] HYPCALL %#03x\n", vcpu->arch.pc, code);
+	/* Only code 0 is passed on as a hypercall; other codes fail emulation */
+	switch (code) {
+	case 0:
+		return EMULATE_HYPERCALL;
+	default:
+		return EMULATE_FAIL;
+	}
+}
+
+static int kvm_mips_hypercall(struct kvm_vcpu *vcpu, unsigned long num,
+			      const unsigned long *args, unsigned long *hret)
+{
+	/* Nothing implemented yet: ignore num/args, report -KVM_ENOSYS */
+	*hret = -KVM_ENOSYS;
+	return RESUME_GUEST;
+}
+
+int kvm_mips_handle_hypcall(struct kvm_vcpu *vcpu)
+{
+	unsigned long num, args[MAX_HYPCALL_ARGS];
+
+	/* Read the hypercall number (v0) and the four arguments (a0-a3) */
+	num = vcpu->arch.gprs[2];	/* v0 */
+	args[0] = vcpu->arch.gprs[4];	/* a0 */
+	args[1] = vcpu->arch.gprs[5];	/* a1 */
+	args[2] = vcpu->arch.gprs[6];	/* a2 */
+	args[3] = vcpu->arch.gprs[7];	/* a3 */
+	/* The result is written straight back into the guest's v0 */
+	return kvm_mips_hypercall(vcpu, num,
+				  args, &vcpu->arch.gprs[2] /* v0 */);
+}
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index b1fa53b252ea..3a854bb9e606 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -82,6 +82,10 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
 		ret = RESUME_HOST;
 		break;
 
+	case EMULATE_HYPERCALL:
+		ret = kvm_mips_handle_hypcall(vcpu);
+		break;
+
 	default:
 		BUG();
 	}
-- 
cgit v1.2.3-55-g7522


From a8a3c426772e55ae9c3209f061cb6317268f932c Mon Sep 17 00:00:00 2001
From: James Hogan
Date: Tue, 14 Mar 2017 10:15:19 +0000
Subject: KVM: MIPS: Add VZ & TE capabilities

Add new KVM_CAP_MIPS_VZ and KVM_CAP_MIPS_TE capabilities, and in order
to allow MIPS KVM to support VZ without confusing old users (which
expect the trap & emulate implementation), define and start checking
KVM_CREATE_VM type codes.

The codes available are:

 - KVM_VM_MIPS_TE = 0

   This is the current value expected from the user, and will create a
   VM using trap & emulate in user mode, confined to the user mode
   address space. This may in future become unavailable if the kernel is
   only configured to support VZ, in which case the EINVAL error will be
   returned and KVM_CAP_MIPS_TE won't be available even though
   KVM_CAP_MIPS_VZ is.

 - KVM_VM_MIPS_VZ = 1

   This can be provided when the KVM_CAP_MIPS_VZ capability is available
   to create a VM using VZ, with a fully virtualized guest virtual
   address space. If VZ support is unavailable in the kernel, the EINVAL
   error will be returned (although old kernels without the
   KVM_CAP_MIPS_VZ capability may well succeed and create a trap &
   emulate VM).

This is designed to allow the desired implementation (T&E vs VZ) to be
potentially chosen at runtime rather than being fixed in the kernel
configuration.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Cc: linux-doc@vger.kernel.org
---
 Documentation/virtual/kvm/api.txt | 47 ++++++++++++++++++++++++++++++++++++++-
 arch/mips/kvm/mips.c              |  9 ++++++++
 include/uapi/linux/kvm.h          |  6 +++++
 3 files changed, 61 insertions(+), 1 deletion(-)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 3c248f772ae6..4b5fa2571efa 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -115,12 +115,17 @@ will access the virtual machine's physical address space; offset zero
 corresponds to guest physical address zero.  Use of mmap() on a VM fd
 is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is
 available.
-You most certainly want to use 0 as machine type.
+You probably want to use 0 as machine type.
 
 In order to create user controlled virtual machines on S390, check
 KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as
 privileged user (CAP_SYS_ADMIN).
 
+To use hardware assisted virtualization on MIPS (VZ ASE) rather than
+the default trap & emulate implementation (which changes the virtual
+memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the
+flag KVM_VM_MIPS_VZ.
+
 
 4.3 KVM_GET_MSR_INDEX_LIST
 
@@ -4147,3 +4152,43 @@ This capability, if KVM_CHECK_EXTENSION indicates that it is
 available, means that that the kernel can support guests using the
 hashed page table MMU defined in Power ISA V3.00 (as implemented in
 the POWER9 processor), including in-memory segment tables.
+
+8.5 KVM_CAP_MIPS_VZ
+
+Architectures: mips
+
+This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that
+it is available, means that full hardware assisted virtualization capabilities
+of the hardware are available for use through KVM. An appropriate
+KVM_VM_MIPS_* type must be passed to KVM_CREATE_VM to create a VM which
+utilises it.
+
+If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is
+available, it means that the VM is using full hardware assisted virtualization
+capabilities of the hardware. This is useful to check after creating a VM with
+KVM_VM_MIPS_DEFAULT.
+
+The value returned by KVM_CHECK_EXTENSION should be compared against known
+values (see below). All other values are reserved. This is to allow for the
+possibility of other hardware assisted virtualization implementations which
+may be incompatible with the MIPS VZ ASE.
+
+ 0: The trap & emulate implementation is in use to run guest code in user
+    mode. Guest virtual memory segments are rearranged to fit the guest in the
+    user mode address space.
+
+ 1: The MIPS VZ ASE is in use, providing full hardware assisted
+    virtualization, including standard guest virtual memory segments.
+
+8.6 KVM_CAP_MIPS_TE
+
+Architectures: mips
+
+This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that
+it is available, means that the trap & emulate implementation is available to
+run guest code in user mode, even if KVM_CAP_MIPS_VZ indicates that hardware
+assisted virtualisation is also available. KVM_VM_MIPS_TE (0) must be passed
+to KVM_CREATE_VM to create a VM which utilises it.
+
+If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is
+available, it means that the VM is using trap & emulate.
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index c507533ef6ea..476ece99bf3b 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -107,6 +107,14 @@ void kvm_arch_check_processor_compat(void *rtn)
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
+	switch (type) {
+	case KVM_VM_MIPS_TE:
+		break;
+	default:
+		/* Unsupported KVM type */
+		return -EINVAL;
+	};
+
 	/* Allocate page table to map GPA -> RPA */
 	kvm->arch.gpa_mm.pgd = kvm_pgd_alloc();
 	if (!kvm->arch.gpa_mm.pgd)
@@ -1038,6 +1046,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_READONLY_MEM:
 	case KVM_CAP_SYNC_MMU:
 	case KVM_CAP_IMMEDIATE_EXIT:
+	case KVM_CAP_MIPS_TE:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index f51d5082a377..58ddedce4235 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -702,6 +702,10 @@ struct kvm_ppc_resize_hpt {
 #define KVM_VM_PPC_HV 1
 #define KVM_VM_PPC_PR 2
 
+/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */
+#define KVM_VM_MIPS_TE		0
+#define KVM_VM_MIPS_VZ		1
+
 #define KVM_S390_SIE_PAGE_OFFSET 1
 
 /*
@@ -883,6 +887,8 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_MMU_RADIX 134
 #define KVM_CAP_PPC_MMU_HASH_V3 135
 #define KVM_CAP_IMMEDIATE_EXIT 136
+#define KVM_CAP_MIPS_VZ 137
+#define KVM_CAP_MIPS_TE 138
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3-55-g7522


From 578fd61d2d210a3b58dc107f5382b965922ac253 Mon Sep 17 00:00:00 2001
From: James Hogan
Date: Tue, 14 Mar 2017 10:15:20 +0000
Subject: KVM: MIPS: Add 64BIT capability

Add a new KVM_CAP_MIPS_64BIT capability to indicate that 64-bit MIPS
guests are available and supported. In this case it should still be
possible to run 32-bit guest code. If not available it won't be possible
to run 64-bit guest code and the instructions may not be available, or
the kernel may not support full context switching of 64-bit registers.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Cc: linux-doc@vger.kernel.org
---
 Documentation/virtual/kvm/api.txt | 25 +++++++++++++++++++++++++
 include/uapi/linux/kvm.h          |  1 +
 2 files changed, 26 insertions(+)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 4b5fa2571efa..1b8486c094b4 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4192,3 +4192,28 @@ to KVM_CREATE_VM to create a VM which utilises it.
 
 If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is
 available, it means that the VM is using trap & emulate.
+
+8.7 KVM_CAP_MIPS_64BIT
+
+Architectures: mips
+
+This capability indicates the supported architecture type of the guest, i.e. the
+supported register and address width.
+
+The values returned when this capability is checked by KVM_CHECK_EXTENSION on a
+kvm VM handle correspond roughly to the CP0_Config.AT register field, and should
+be checked specifically against known values (see below). All other values are
+reserved.
+
+ 0: MIPS32 or microMIPS32.
+    Both registers and addresses are 32 bits wide.
+    It will only be possible to run 32-bit guest code.
+
+ 1: MIPS64 or microMIPS64 with access only to 32-bit compatibility segments.
+    Registers are 64 bits wide, but addresses are 32 bits wide.
+    64-bit guest code may run but cannot access MIPS64 memory segments.
+    It will also be possible to run 32-bit guest code.
+
+ 2: MIPS64 or microMIPS64 with access to all address segments.
+    Both registers and addresses are 64 bits wide.
+    It will be possible to run 64-bit or 32-bit guest code.
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 58ddedce4235..1e1a6c728a18 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -889,6 +889,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_IMMEDIATE_EXIT 136
 #define KVM_CAP_MIPS_VZ 137
 #define KVM_CAP_MIPS_TE 138
+#define KVM_CAP_MIPS_64BIT 139
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3-55-g7522


From c992a4f6a9b0a37c8bd7dfc727ecc3fed125c16b Mon Sep 17 00:00:00 2001
From: James Hogan
Date: Tue, 14 Mar 2017 10:15:31 +0000
Subject: KVM: MIPS: Implement VZ support

Add the main support for the MIPS Virtualization ASE (A.K.A. VZ) to MIPS
KVM. The bulk of this work is in vz.c, with various new state and
definitions elsewhere.

Enough is implemented to be able to run on a minimal VZ core. Further
patches will fill out support for guest features which are optional or
can be disabled.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Cc: linux-doc@vger.kernel.org
---
 Documentation/virtual/kvm/api.txt |    2 +
 arch/mips/include/asm/cpu-info.h  |    1 +
 arch/mips/include/asm/kvm_host.h  |   42 +
 arch/mips/kernel/time.c           |    1 +
 arch/mips/kvm/interrupt.h         |    5 +
 arch/mips/kvm/mips.c              |    5 +
 arch/mips/kvm/mmu.c               |   20 +
 arch/mips/kvm/tlb.c               |    7 +
 arch/mips/kvm/trace.h             |   15 +
 arch/mips/kvm/vz.c                | 2381 +++++++++++++++++++++++++++++++++++++
 10 files changed, 2479 insertions(+)
 create mode 100644 arch/mips/kvm/vz.c

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 1b8486c094b4..d2827864827f 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2075,6 +2075,7 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_CONTEXT      | 64
   MIPS  | KVM_REG_MIPS_CP0_USERLOCAL    | 64
   MIPS  | KVM_REG_MIPS_CP0_PAGEMASK     | 32
+  MIPS  | KVM_REG_MIPS_CP0_PAGEGRAIN    | 32
   MIPS  | KVM_REG_MIPS_CP0_WIRED        | 32
   MIPS  | KVM_REG_MIPS_CP0_HWRENA       | 32
   MIPS  | KVM_REG_MIPS_CP0_BADVADDR     | 64
@@ -2094,6 +2095,7 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_CONFIG4      | 32
   MIPS  | KVM_REG_MIPS_CP0_CONFIG5      | 32
   MIPS  | KVM_REG_MIPS_CP0_CONFIG7      | 32
+  MIPS  | KVM_REG_MIPS_CP0_XCONTEXT     | 64
   MIPS  | KVM_REG_MIPS_CP0_ERROREPC     | 64
   MIPS  | KVM_REG_MIPS_CP0_KSCRATCH1    | 64
   MIPS  | KVM_REG_MIPS_CP0_KSCRATCH2    | 64
diff --git a/arch/mips/include/asm/cpu-info.h b/arch/mips/include/asm/cpu-info.h
index 4113796e0ef4..be3b4c25f335 100644
--- a/arch/mips/include/asm/cpu-info.h
+++ b/arch/mips/include/asm/cpu-info.h
@@ -110,6 +110,7 @@ struct cpuinfo_mips {
 	struct guest_info	guest;
 	unsigned int		gtoffset_mask;
 	unsigned int		guestid_mask;
+	unsigned int		guestid_cache;
 } __attribute__((aligned(SMP_CACHE_BYTES)));
 
 extern struct cpuinfo_mips cpu_data[];
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index b47571850aac..bc3f9dedaac8 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -10,6 +10,7 @@
 #ifndef __MIPS_KVM_HOST_H__
 #define __MIPS_KVM_HOST_H__
 
+#include <linux/cpumask.h>
 #include <linux/mutex.h>
 #include <linux/hrtimer.h>
 #include <linux/interrupt.h>
@@ -73,6 +74,11 @@
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #define KVM_HALT_POLL_NS_DEFAULT 500000
 
+#ifdef CONFIG_KVM_MIPS_VZ
+extern unsigned long GUESTID_MASK;
+extern unsigned long GUESTID_FIRST_VERSION;
+extern unsigned long GUESTID_VERSION_MASK;
+#endif
 
 
 /*
@@ -167,6 +173,8 @@ struct kvm_arch_memory_slot {
 struct kvm_arch {
 	/* Guest physical mm */
 	struct mm_struct gpa_mm;
+	/* Mask of CPUs needing GPA ASID flush */
+	cpumask_t asid_flush_mask;
 };
 
 #define N_MIPS_COPROC_REGS	32
@@ -224,6 +232,11 @@ struct mips_coproc {
 #define MIPS_CP0_CONFIG4_SEL	4
 #define MIPS_CP0_CONFIG5_SEL	5
 
+#define MIPS_CP0_GUESTCTL2	10
+#define MIPS_CP0_GUESTCTL2_SEL	5
+#define MIPS_CP0_GTOFFSET	12
+#define MIPS_CP0_GTOFFSET_SEL	7
+
 /* Resume Flags */
 #define RESUME_FLAG_DR		(1<<0)	/* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST	(1<<1)	/* Resume host? */
@@ -356,7 +369,20 @@ struct kvm_vcpu_arch {
 	/* Cache some mmu pages needed inside spinlock regions */
 	struct kvm_mmu_memory_cache mmu_page_cache;
 
+#ifdef CONFIG_KVM_MIPS_VZ
+	/* vcpu's vzguestid is different on each host cpu in an smp system */
+	u32 vzguestid[NR_CPUS];
+
+	/* wired guest TLB entries */
+	struct kvm_mips_tlb *wired_tlb;
+	unsigned int wired_tlb_limit;
+	unsigned int wired_tlb_used;
+#endif
+
+	/* Last CPU the VCPU state was loaded on */
 	int last_sched_cpu;
+	/* Last CPU the VCPU actually executed guest code on */
+	int last_exec_cpu;
 
 	/* WAIT executed */
 	int wait;
@@ -660,6 +686,7 @@ __BUILD_KVM_RW_HW(config4,        32, MIPS_CP0_CONFIG,       4)
 __BUILD_KVM_RW_HW(config5,        32, MIPS_CP0_CONFIG,       5)
 __BUILD_KVM_RW_HW(config6,        32, MIPS_CP0_CONFIG,       6)
 __BUILD_KVM_RW_HW(config7,        32, MIPS_CP0_CONFIG,       7)
+__BUILD_KVM_RW_HW(xcontext,       l,  MIPS_CP0_TLB_XCONTEXT, 0)
 __BUILD_KVM_RW_HW(errorepc,       l,  MIPS_CP0_ERROR_PC,     0)
 __BUILD_KVM_RW_HW(kscratch1,      l,  MIPS_CP0_DESAVE,       2)
 __BUILD_KVM_RW_HW(kscratch2,      l,  MIPS_CP0_DESAVE,       3)
@@ -674,6 +701,14 @@ __BUILD_KVM_SET_HW(status,        32, MIPS_CP0_STATUS,       0)
 __BUILD_KVM_ATOMIC_HW(cause,      32, MIPS_CP0_CAUSE,        0)
 __BUILD_KVM_SET_HW(ebase,         l,  MIPS_CP0_PRID,         1)
 
+/* Bitwise operations (on saved state) */
+__BUILD_KVM_SET_SAVED(config,     32, MIPS_CP0_CONFIG,       0)
+__BUILD_KVM_SET_SAVED(config1,    32, MIPS_CP0_CONFIG,       1)
+__BUILD_KVM_SET_SAVED(config2,    32, MIPS_CP0_CONFIG,       2)
+__BUILD_KVM_SET_SAVED(config3,    32, MIPS_CP0_CONFIG,       3)
+__BUILD_KVM_SET_SAVED(config4,    32, MIPS_CP0_CONFIG,       4)
+__BUILD_KVM_SET_SAVED(config5,    32, MIPS_CP0_CONFIG,       5)
+
 /* Helpers */
 
 static inline bool kvm_mips_guest_can_have_fpu(struct kvm_vcpu_arch *vcpu)
@@ -786,6 +821,10 @@ u32 kvm_get_user_asid(struct kvm_vcpu *vcpu);
 
 u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
 
+#ifdef CONFIG_KVM_MIPS_VZ
+int kvm_mips_handle_vz_root_tlb_fault(unsigned long badvaddr,
+				      struct kvm_vcpu *vcpu, bool write_fault);
+#endif
 extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr,
 					   struct kvm_vcpu *vcpu,
 					   bool write_fault);
@@ -1026,6 +1065,9 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
 					    struct kvm_run *run,
 					    struct kvm_vcpu *vcpu);
 
+/* COP0 */
+enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu);
+
 unsigned int kvm_mips_config1_wrmask(struct kvm_vcpu *vcpu);
 unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu);
 unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu);
diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c
index a7f81261c781..c036157fb891 100644
--- a/arch/mips/kernel/time.c
+++ b/arch/mips/kernel/time.c
@@ -70,6 +70,7 @@ EXPORT_SYMBOL(perf_irq);
  */
 
 unsigned int mips_hpt_frequency;
+EXPORT_SYMBOL_GPL(mips_hpt_frequency);
 
 /*
  * This function exists in order to cause an error due to a duplicate
diff --git a/arch/mips/kvm/interrupt.h b/arch/mips/kvm/interrupt.h
index fb118a2c8379..3bf0a49725e8 100644
--- a/arch/mips/kvm/interrupt.h
+++ b/arch/mips/kvm/interrupt.h
@@ -30,8 +30,13 @@
 
 #define C_TI        (_ULCAST_(1) << 30)
 
+#ifdef CONFIG_KVM_MIPS_VZ
+#define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (1)
+#define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE   (1)
+#else
 #define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0)
 #define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE   (0)
+#endif
 
 void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
 void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 8c10148a345c..1bb5d01d5dec 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -113,7 +113,11 @@ void kvm_arch_check_processor_compat(void *rtn)
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
 	switch (type) {
+#ifdef CONFIG_KVM_MIPS_VZ
+	case KVM_VM_MIPS_VZ:
+#else
 	case KVM_VM_MIPS_TE:
+#endif
 		break;
 	default:
 		/* Unsupported KVM type */
@@ -378,6 +382,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 
 	/* Init */
 	vcpu->arch.last_sched_cpu = -1;
+	vcpu->arch.last_exec_cpu = -1;
 
 	return vcpu;
 
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index cb0faade311e..ee64db032793 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -992,6 +992,22 @@ static pte_t kvm_mips_gpa_pte_to_gva_mapped(pte_t pte, long entrylo)
 	return kvm_mips_gpa_pte_to_gva_unmapped(pte);
 }
 
+#ifdef CONFIG_KVM_MIPS_VZ
+int kvm_mips_handle_vz_root_tlb_fault(unsigned long badvaddr,
+				      struct kvm_vcpu *vcpu,
+				      bool write_fault)
+{
+	int ret;
+
+	ret = kvm_mips_map_page(vcpu, badvaddr, write_fault, NULL, NULL);
+	if (ret)
+		return ret;	/* non-zero map result is returned unchanged */
+
+	/* Otherwise invalidate this entry in the TLB and return that result */
+	return kvm_vz_host_tlb_inv(vcpu, badvaddr);
+}
+#endif
+
 /* XXXKYMA: Must be called with interrupts disabled */
 int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
 				    struct kvm_vcpu *vcpu,
@@ -1225,6 +1241,10 @@ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
 {
 	int err;
 
+	if (WARN(IS_ENABLED(CONFIG_KVM_MIPS_VZ),
+		 "Expect BadInstr/BadInstrP registers to be used with VZ\n"))
+		return -EINVAL;
+
 retry:
 	kvm_trap_emul_gva_lockless_begin(vcpu);
 	err = get_user(*out, opc);
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index a28fcb1e5072..c215470fdcb0 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -34,6 +34,13 @@
 #define KVM_GUEST_SP_TLB    1
 
 #ifdef CONFIG_KVM_MIPS_VZ
+unsigned long GUESTID_MASK;
+EXPORT_SYMBOL_GPL(GUESTID_MASK);
+unsigned long GUESTID_FIRST_VERSION;
+EXPORT_SYMBOL_GPL(GUESTID_FIRST_VERSION);
+unsigned long GUESTID_VERSION_MASK;
+EXPORT_SYMBOL_GPL(GUESTID_VERSION_MASK);
+
 static u32 kvm_mips_get_root_asid(struct kvm_vcpu *vcpu)
 {
 	struct mm_struct *gpa_mm = &vcpu->kvm->arch.gpa_mm;
diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h
index 0c59282a2f7d..d80d37a1b82e 100644
--- a/arch/mips/kvm/trace.h
+++ b/arch/mips/kvm/trace.h
@@ -286,6 +286,21 @@ TRACE_EVENT(kvm_asid_change,
 		      __entry->new_asid)
 );
 
+TRACE_EVENT(kvm_guestid_change,
+	    TP_PROTO(struct kvm_vcpu *vcpu, unsigned int guestid),
+	    TP_ARGS(vcpu, guestid),
+	    TP_STRUCT__entry(
+			__field(unsigned int, guestid)
+	    ),
+	    /* Only guestid is recorded; the vcpu argument is not stored */
+	    TP_fast_assign(
+			__entry->guestid = guestid;
+	    ),
+
+	    TP_printk("GuestID: 0x%02x",
+		      __entry->guestid)
+);
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
new file mode 100644
index 000000000000..cfed234be1e3
--- /dev/null
+++ b/arch/mips/kvm/vz.c
@@ -0,0 +1,2381 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * KVM/MIPS: Support for hardware virtualization extensions
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+ * Authors: Yann Le Du <ledu@kymasys.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/vmalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/cacheops.h>
+#include <asm/cmpxchg.h>
+#include <asm/fpu.h>
+#include <asm/hazards.h>
+#include <asm/inst.h>
+#include <asm/mmu_context.h>
+#include <asm/r4kcache.h>
+#include <asm/time.h>
+#include <asm/tlb.h>
+#include <asm/tlbex.h>
+
+#include <linux/kvm_host.h>
+
+#include "interrupt.h"
+
+#include "trace.h"
+
+/* Pointers to last VCPU loaded on each physical CPU */
+static struct kvm_vcpu *last_vcpu[NR_CPUS];
+/* Pointers to last VCPU executed on each physical CPU */
+static struct kvm_vcpu *last_exec_vcpu[NR_CPUS];
+
+/*
+ * Number of guest VTLB entries to use, so we can catch inconsistency between
+ * CPUs.
+ */
+static unsigned int kvm_vz_guest_vtlb_size;
+
+static inline long kvm_vz_read_gc0_ebase(void)
+{
+	/* Only a 64-bit kernel on a CPU with EBase.WG reads the wide form */
+	if (sizeof(long) != 8 || !cpu_has_ebase_wg)
+		return read_gc0_ebase();
+	return read_gc0_ebase_64();
+}
+
+static inline void kvm_vz_write_gc0_ebase(long v)
+{
+	/*
+	 * The upper bits are only written while WG=1, so write once with WG
+	 * forced on and then once more with the WG value the caller passed.
+	 * write_gc0_ebase_64() is no longer UNDEFINED since R6.
+	 */
+	if (sizeof(long) != 8 ||
+	    !(cpu_has_mips64r6 || cpu_has_ebase_wg)) {
+		write_gc0_ebase(v | MIPS_EBASE_WG);
+		write_gc0_ebase(v);
+	} else {
+		write_gc0_ebase_64(v | MIPS_EBASE_WG);
+		write_gc0_ebase_64(v);
+	}
+}
+
+/*
+ * These Config bits may be writable by the guest:
+ * Config:	[K23, KU] (!TLB), K0
+ * Config1:	(none)
+ * Config2:	[TU, SU] (impl)
+ * Config3:	ISAOnExc
+ * Config4:	FTLBPageSize
+ * Config5:	K, CV, MSAEn, UFE, FRE, SBRI, UFR
+ */
+
+static inline unsigned int kvm_vz_config_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+	return CONF_CM_CMASK;
+}
+
+static inline unsigned int kvm_vz_config1_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+static inline unsigned int kvm_vz_config2_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+static inline unsigned int kvm_vz_config3_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+	return MIPS_CONF3_ISA_OE;
+}
+
+static inline unsigned int kvm_vz_config4_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+	/* no need to be exact */
+	return MIPS_CONF4_VFTLBPAGESIZE;
+}
+
+static inline unsigned int kvm_vz_config5_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+	unsigned int mask = MIPS_CONF5_K | MIPS_CONF5_CV | MIPS_CONF5_SBRI;
+
+	/* Permit MSAEn changes if MSA supported and enabled */
+	if (kvm_mips_guest_has_msa(&vcpu->arch))
+		mask |= MIPS_CONF5_MSAEN;
+
+	/*
+	 * Permit guest FPU mode changes if FPU is enabled and the relevant
+	 * feature exists according to FIR register.
+	 */
+	if (kvm_mips_guest_has_fpu(&vcpu->arch)) {
+		if (cpu_has_ufr)
+			mask |= MIPS_CONF5_UFR;
+		if (cpu_has_fre)
+			mask |= MIPS_CONF5_FRE | MIPS_CONF5_UFE;
+	}
+
+	return mask;
+}
+
+/*
+ * VZ optionally allows these additional Config bits to be written by root:
+ * Config:	M, [MT]
+ * Config1:	M, [MMUSize-1, C2, MD, PC, WR, CA], FP
+ * Config2:	M
+ * Config3:	M, MSAP, [BPG], ULRI, [DSP2P, DSPP, CTXTC, ITL, LPA, VEIC,
+ *		VInt, SP, CDMM, MT, SM, TL]
+ * Config4:	M, [VTLBSizeExt, MMUSizeExt]
+ * Config5:	[MRP]
+ */
+
+static inline unsigned int kvm_vz_config_user_wrmask(struct kvm_vcpu *vcpu)
+{
+	return kvm_vz_config_guest_wrmask(vcpu) | MIPS_CONF_M;
+}
+
+static inline unsigned int kvm_vz_config1_user_wrmask(struct kvm_vcpu *vcpu)
+{
+	unsigned int mask = kvm_vz_config1_guest_wrmask(vcpu) | MIPS_CONF_M;
+
+	/* Permit FPU to be present if FPU is supported */
+	if (kvm_mips_guest_can_have_fpu(&vcpu->arch))
+		mask |= MIPS_CONF1_FP;
+
+	return mask;
+}
+
+static inline unsigned int kvm_vz_config2_user_wrmask(struct kvm_vcpu *vcpu)
+{
+	return kvm_vz_config2_guest_wrmask(vcpu) | MIPS_CONF_M;
+}
+
+static inline unsigned int kvm_vz_config3_user_wrmask(struct kvm_vcpu *vcpu)
+{
+	unsigned int mask = kvm_vz_config3_guest_wrmask(vcpu) | MIPS_CONF_M |
+		MIPS_CONF3_ULRI;
+
+	/* Permit MSA to be present if MSA is supported */
+	if (kvm_mips_guest_can_have_msa(&vcpu->arch))
+		mask |= MIPS_CONF3_MSA;
+
+	return mask;
+}
+
+static inline unsigned int kvm_vz_config4_user_wrmask(struct kvm_vcpu *vcpu)
+{
+	return kvm_vz_config4_guest_wrmask(vcpu) | MIPS_CONF_M;
+}
+
+static inline unsigned int kvm_vz_config5_user_wrmask(struct kvm_vcpu *vcpu)
+{
+	return kvm_vz_config5_guest_wrmask(vcpu);
+}
+
+static gpa_t kvm_vz_gva_to_gpa_cb(gva_t gva)
+{
+	/* VZ guest has already converted gva to gpa */
+	return gva;
+}
+
+static void kvm_vz_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
+{
+	set_bit(priority, &vcpu->arch.pending_exceptions);
+	clear_bit(priority, &vcpu->arch.pending_exceptions_clr);
+}
+
+static void kvm_vz_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
+{
+	clear_bit(priority, &vcpu->arch.pending_exceptions);
+	set_bit(priority, &vcpu->arch.pending_exceptions_clr);
+}
+
+static void kvm_vz_queue_timer_int_cb(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * timer expiry is asynchronous to vcpu execution therefore defer guest
+	 * cp0 accesses
+	 */
+	kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_TIMER);
+}
+
+static void kvm_vz_dequeue_timer_int_cb(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * timer expiry is asynchronous to vcpu execution therefore defer guest
+	 * cp0 accesses
+	 */
+	kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_TIMER);
+}
+
+static void kvm_vz_queue_io_int_cb(struct kvm_vcpu *vcpu,
+				   struct kvm_mips_interrupt *irq)
+{
+	unsigned int priority;
+
+	/*
+	 * Interrupts arrive asynchronously to vcpu execution, so only queue
+	 * the request here and leave the guest cp0 accesses for later.
+	 */
+	switch ((int)irq->irq) {
+	case 2:
+		priority = MIPS_EXC_INT_IO;
+		break;
+
+	case 3:
+		priority = MIPS_EXC_INT_IPI_1;
+		break;
+
+	case 4:
+		priority = MIPS_EXC_INT_IPI_2;
+		break;
+
+	default:
+		return;
+	}
+	kvm_vz_queue_irq(vcpu, priority);
+}
+
+static void kvm_vz_dequeue_io_int_cb(struct kvm_vcpu *vcpu,
+				     struct kvm_mips_interrupt *irq)
+{
+	unsigned int priority;
+
+	/*
+	 * Interrupts are withdrawn asynchronously to vcpu execution, so only
+	 * record the request here and leave the guest cp0 accesses for later.
+	 */
+	switch ((int)irq->irq) {
+	case -2:
+		priority = MIPS_EXC_INT_IO;
+		break;
+
+	case -3:
+		priority = MIPS_EXC_INT_IPI_1;
+		break;
+
+	case -4:
+		priority = MIPS_EXC_INT_IPI_2;
+		break;
+
+	default:
+		return;
+	}
+	kvm_vz_dequeue_irq(vcpu, priority);
+}
+
+static u32 kvm_vz_priority_to_irq[MIPS_EXC_MAX] = {
+	[MIPS_EXC_INT_TIMER] = C_IRQ5,
+	[MIPS_EXC_INT_IO]    = C_IRQ0,
+	[MIPS_EXC_INT_IPI_1] = C_IRQ1,
+	[MIPS_EXC_INT_IPI_2] = C_IRQ2,
+};
+
+static int kvm_vz_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
+				 u32 cause)
+{
+	u32 irq = 0;
+
+	if (priority < MIPS_EXC_MAX)
+		irq = kvm_vz_priority_to_irq[priority];
+
+	if (priority == MIPS_EXC_INT_TIMER) {
+		set_gc0_cause(C_TI);
+	} else if (priority == MIPS_EXC_INT_IO ||
+		   priority == MIPS_EXC_INT_IPI_1 ||
+		   priority == MIPS_EXC_INT_IPI_2) {
+		/*
+		 * Use GuestCtl2 when the CPU has it, otherwise set the bit in
+		 * the guest Cause register directly.
+		 */
+		if (cpu_has_guestctl2)
+			set_c0_guestctl2(irq);
+		else
+			set_gc0_cause(irq);
+	}
+
+	/* This priority is no longer waiting to be delivered */
+	clear_bit(priority, &vcpu->arch.pending_exceptions);
+	return 1;
+}
+
+static int kvm_vz_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority,
+			       u32 cause)
+{
+	u32 irq = (priority < MIPS_EXC_MAX) ?
+		kvm_vz_priority_to_irq[priority] : 0;
+
+	switch (priority) {
+	case MIPS_EXC_INT_TIMER:
+		/*
+		 * Call to kvm_write_c0_guest_compare() clears Cause.TI in
+		 * kvm_mips_emulate_CP0(). Explicitly clear irq associated with
+		 * Cause.IP[IPTI] if GuestCtl2 virtual interrupt register not
+		 * supported or if not using GuestCtl2 Hardware Clear.
+		 */
+		if (cpu_has_guestctl2) {
+			if (!(read_c0_guestctl2() & (irq << 14)))
+				clear_c0_guestctl2(irq);
+		} else {
+			clear_gc0_cause(irq);
+		}
+		break;
+
+	case MIPS_EXC_INT_IO:
+	case MIPS_EXC_INT_IPI_1:
+	case MIPS_EXC_INT_IPI_2:
+		/* Clear GuestCtl2.VIP irq if not using Hardware Clear */
+		if (cpu_has_guestctl2) {
+			if (!(read_c0_guestctl2() & (irq << 14)))
+				clear_c0_guestctl2(irq);
+		} else {
+			clear_gc0_cause(irq);
+		}
+		break;
+
+	default:
+		break;
+	}
+
+	clear_bit(priority, &vcpu->arch.pending_exceptions_clr);
+	return 1;
+}
+
+/*
+ * VZ guest timer handling.
+ */
+
+/**
+ * _kvm_vz_restore_stimer() - Restore soft timer state.
+ * @vcpu:	Virtual CPU.
+ * @compare:	CP0_Compare register value, restored by caller.
+ * @cause:	CP0_Cause register to restore.
+ *
+ * Restore VZ state relating to the soft timer.
+ */
+static void _kvm_vz_restore_stimer(struct kvm_vcpu *vcpu, u32 compare,
+				   u32 cause)
+{
+	/*
+	 * Avoid spurious counter interrupts by setting Guest CP0_Count to just
+	 * after Guest CP0_Compare.
+	 */
+	write_c0_gtoffset(compare - read_c0_count());
+
+	back_to_back_c0_hazard();
+	write_gc0_cause(cause);
+}
+
+/**
+ * kvm_vz_restore_timer() - Restore guest timer state.
+ * @vcpu:	Virtual CPU.
+ *
+ * Restore soft timer state from saved context.
+ */
+static void kvm_vz_restore_timer(struct kvm_vcpu *vcpu)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	u32 saved_compare = kvm_read_sw_gc0_compare(cop0);
+	u32 saved_cause = kvm_read_sw_gc0_cause(cop0);
+
+	/* _kvm_vz_restore_stimer() leaves restoring Compare to its caller */
+	write_gc0_compare(saved_compare);
+
+	_kvm_vz_restore_stimer(vcpu, saved_compare, saved_cause);
+}
+
+/**
+ * kvm_vz_save_timer() - Save guest timer state.
+ * @vcpu:	Virtual CPU.
+ *
+ * Save VZ guest timer state.
+ */
+static void kvm_vz_save_timer(struct kvm_vcpu *vcpu)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	u32 hw_compare = read_gc0_compare();
+	u32 hw_cause = read_gc0_cause();
+
+	/*
+	 * Copy both hardware values into the VCPU's software cop0 context.
+	 */
+	kvm_write_sw_gc0_cause(cop0, hw_cause);
+	kvm_write_sw_gc0_compare(cop0, hw_compare);
+}
+
+/**
+ * kvm_vz_gva_to_gpa() - Convert valid GVA to GPA.
+ * @vcpu:	KVM VCPU state.
+ * @gva:	Guest virtual address to convert.
+ * @gpa:	Output guest physical address.
+ *
+ * Convert a guest virtual address (GVA) which is valid according to the guest
+ * context, to a guest physical address (GPA).
+ *
+ * Returns:	0 on success.
+ *		-errno on failure.
+ */
+static int kvm_vz_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+			     unsigned long *gpa)
+{
+	u32 gva32 = gva;
+
+	if ((long)gva == (s32)gva32) {
+		/* Handle canonical 32-bit virtual address */
+		if ((s32)gva32 < (s32)0xc0000000) {
+			/* legacy unmapped KSeg0 or KSeg1 */
+			*gpa = gva32 & 0x1fffffff;
+			return 0;
+		}
+#ifdef CONFIG_64BIT
+	} else if ((gva & 0xc000000000000000) == 0x8000000000000000) {
+		/* XKPHYS */
+		/*
+		 * Traditionally fully unmapped.
+		 * Bits 61:59 specify the CCA, which we can just mask off here.
+		 * Bits 58:PABITS should be zero, but we shouldn't have got here
+		 * if it wasn't.
+		 */
+		*gpa = gva & 0x07ffffffffffffff;
+		return 0;
+#endif
+	}
+
+	return kvm_vz_guest_tlb_lookup(vcpu, gva, gpa);
+}
+
+/**
+ * kvm_vz_badvaddr_to_gpa() - Convert GVA BadVAddr from root exception to GPA.
+ * @vcpu:	KVM VCPU state.
+ * @badvaddr:	Root BadVAddr.
+ * @gpa:	Output guest physical address.
+ *
+ * VZ implementations are permitted to report guest virtual addresses (GVA) in
+ * BadVAddr on a root exception during guest execution, instead of the more
+ * convenient guest physical addresses (GPA). When we get a GVA, this function
+ * converts it to a GPA, taking into account guest segmentation and guest TLB
+ * state.
+ *
+ * Returns:	0 on success.
+ *		-errno on failure.
+ */
+static int kvm_vz_badvaddr_to_gpa(struct kvm_vcpu *vcpu, unsigned long badvaddr,
+				  unsigned long *gpa)
+{
+	unsigned int code = (vcpu->arch.host_cp0_guestctl0 & MIPS_GCTL0_GEXC) >>
+			    MIPS_GCTL0_GEXC_SHIFT;
+
+	/* The common case: BadVAddr already holds a GPA */
+	if (likely(code == MIPS_GCTL0_GEXC_GPA)) {
+		*gpa = badvaddr;
+		return 0;
+	}
+
+	/* A GVA is translated in software; anything else is unexpected */
+	if (!WARN(code != MIPS_GCTL0_GEXC_GVA,
+		  "Unexpected gexccode %#x\n", code))
+		return kvm_vz_gva_to_gpa(vcpu, badvaddr, gpa);
+
+	/* Neither GPA nor GVA: WARN() has already logged the code */
+	return -EINVAL;
+}
+
+static int kvm_trap_vz_no_handler(struct kvm_vcpu *vcpu)
+{
+	u32 *opc = (u32 *) vcpu->arch.pc;
+	u32 cause = vcpu->arch.host_cp0_cause;
+	u32 exccode = (cause & CAUSEF_EXCCODE) >> CAUSEB_EXCCODE;
+	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
+	u32 inst = 0;
+
+	/*
+	 *  Fetch the instruction.
+	 */
+	if (cause & CAUSEF_BD)
+		opc += 1;
+	kvm_get_badinstr(opc, vcpu, &inst);
+
+	kvm_err("Exception Code: %d not handled @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#x\n",
+		exccode, opc, inst, badvaddr,
+		read_gc0_status());
+	kvm_arch_vcpu_dump_regs(vcpu);
+	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+	return RESUME_HOST;
+}
+
+static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst,
+					      u32 *opc, u32 cause,
+					      struct kvm_run *run,
+					      struct kvm_vcpu *vcpu)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	enum emulation_result er = EMULATE_DONE;
+	u32 rt, rd, sel;
+	unsigned long curr_pc;
+	unsigned long val;
+
+	/*
+	 * Update PC and hold onto current PC in case there is
+	 * an error and we want to rollback the PC
+	 */
+	curr_pc = vcpu->arch.pc;
+	er = update_pc(vcpu, cause);
+	if (er == EMULATE_FAIL)
+		return er;
+
+	if (inst.co_format.co) {
+		switch (inst.co_format.func) {
+		case wait_op:
+			er = kvm_mips_emul_wait(vcpu);
+			break;
+		default:
+			er = EMULATE_FAIL;
+		}
+	} else {
+		rt = inst.c0r_format.rt;
+		rd = inst.c0r_format.rd;
+		sel = inst.c0r_format.sel;
+
+		switch (inst.c0r_format.rs) {
+		case dmfc_op:
+		case mfc_op:
+#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
+			cop0->stat[rd][sel]++;
+#endif
+			if (rd == MIPS_CP0_COUNT &&
+			    sel == 0) {			/* Count */
+				val = kvm_mips_read_count(vcpu);
+			} else if (rd == MIPS_CP0_COMPARE &&
+				   sel == 0) {		/* Compare */
+				val = read_gc0_compare();
+			} else if ((rd == MIPS_CP0_PRID &&
+				    (sel == 0 ||	/* PRid */
+				     sel == 2 ||	/* CDMMBase */
+				     sel == 3)) ||	/* CMGCRBase */
+				   (rd == MIPS_CP0_STATUS &&
+				    (sel == 2 ||	/* SRSCtl */
+				     sel == 3)) ||	/* SRSMap */
+				   (rd == MIPS_CP0_CONFIG &&
+				    (sel == 7)) ||	/* Config7 */
+				   (rd == MIPS_CP0_ERRCTL &&
+				    (sel == 0))) {	/* ErrCtl */
+				val = cop0->reg[rd][sel];
+			} else {
+				val = 0;
+				er = EMULATE_FAIL;
+			}
+
+			if (er != EMULATE_FAIL) {
+				/* Sign extend */
+				if (inst.c0r_format.rs == mfc_op)
+					val = (int)val;
+				vcpu->arch.gprs[rt] = val;
+			}
+
+			trace_kvm_hwr(vcpu, (inst.c0r_format.rs == mfc_op) ?
+					KVM_TRACE_MFC0 : KVM_TRACE_DMFC0,
+				      KVM_TRACE_COP0(rd, sel), val);
+			break;
+
+		case dmtc_op:
+		case mtc_op:
+#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
+			cop0->stat[rd][sel]++;
+#endif
+			val = vcpu->arch.gprs[rt];
+			trace_kvm_hwr(vcpu, (inst.c0r_format.rs == mtc_op) ?
+					KVM_TRACE_MTC0 : KVM_TRACE_DMTC0,
+				      KVM_TRACE_COP0(rd, sel), val);
+
+			if (rd == MIPS_CP0_COUNT &&
+			    sel == 0) {			/* Count */
+				kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]);
+			} else if (rd == MIPS_CP0_COMPARE &&
+				   sel == 0) {		/* Compare */
+				kvm_mips_write_compare(vcpu,
+						       vcpu->arch.gprs[rt],
+						       true);
+			} else if (rd == MIPS_CP0_ERRCTL &&
+				   (sel == 0)) {	/* ErrCtl */
+				/* ignore the written value */
+			} else {
+				er = EMULATE_FAIL;
+			}
+			break;
+
+		default:
+			er = EMULATE_FAIL;
+			break;
+		}
+	}
+	/* Rollback PC only if emulation was unsuccessful */
+	if (er == EMULATE_FAIL) {
+		kvm_err("[%#lx]%s: unsupported cop0 instruction 0x%08x\n",
+			curr_pc, __func__, inst.word);
+
+		vcpu->arch.pc = curr_pc;
+	}
+
+	return er;
+}
+
+static enum emulation_result kvm_vz_gpsi_cache(union mips_instruction inst,
+					       u32 *opc, u32 cause,
+					       struct kvm_run *run,
+					       struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er = EMULATE_DONE;
+	u32 cache, op_inst, op, base;
+	s16 offset;
+	struct kvm_vcpu_arch *arch = &vcpu->arch;
+	unsigned long va, curr_pc;
+
+	/*
+	 * Update PC and hold onto current PC in case there is
+	 * an error and we want to rollback the PC
+	 */
+	curr_pc = vcpu->arch.pc;
+	er = update_pc(vcpu, cause);
+	if (er == EMULATE_FAIL)
+		return er;
+
+	base = inst.i_format.rs;
+	op_inst = inst.i_format.rt;
+	if (cpu_has_mips_r6)
+		offset = inst.spec3_format.simmediate;
+	else
+		offset = inst.i_format.simmediate;
+	cache = op_inst & CacheOp_Cache;
+	op = op_inst & CacheOp_Op;
+
+	va = arch->gprs[base] + offset;
+
+	kvm_debug("CACHE (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n",
+		  cache, op, base, arch->gprs[base], offset);
+
+	/* Secondary or tertiary cache ops ignored */
+	if (cache != Cache_I && cache != Cache_D)
+		return EMULATE_DONE;
+
+	switch (op_inst) {
+	case Index_Invalidate_I:
+		flush_icache_line_indexed(va);
+		return EMULATE_DONE;
+	case Index_Writeback_Inv_D:
+		flush_dcache_line_indexed(va);
+		return EMULATE_DONE;
+	default:
+		break;
+	};
+
+	kvm_err("@ %#lx/%#lx CACHE (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n",
+		curr_pc, vcpu->arch.gprs[31], cache, op, base, arch->gprs[base],
+		offset);
+	/* Rollback PC */
+	vcpu->arch.pc = curr_pc;
+
+	return EMULATE_FAIL;
+}
+
+static enum emulation_result kvm_trap_vz_handle_gpsi(u32 cause, u32 *opc,
+						     struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er = EMULATE_DONE;
+	struct kvm_vcpu_arch *arch = &vcpu->arch;
+	struct kvm_run *run = vcpu->run;
+	union mips_instruction inst;
+	int rd, rt, sel;
+	int err;
+
+	/*
+	 *  Fetch the instruction.
+	 */
+	if (cause & CAUSEF_BD)
+		opc += 1;
+	err = kvm_get_badinstr(opc, vcpu, &inst.word);
+	if (err)
+		return EMULATE_FAIL;
+
+	switch (inst.r_format.opcode) {
+	case cop0_op:
+		er = kvm_vz_gpsi_cop0(inst, opc, cause, run, vcpu);
+		break;
+#ifndef CONFIG_CPU_MIPSR6
+	case cache_op:
+		trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
+		er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu);
+		break;
+#endif
+	case spec3_op:
+		switch (inst.spec3_format.func) {
+#ifdef CONFIG_CPU_MIPSR6
+		case cache6_op:
+			trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
+			er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu);
+			break;
+#endif
+		case rdhwr_op:
+			if (inst.r_format.rs || (inst.r_format.re >> 3))
+				goto unknown;
+
+			rd = inst.r_format.rd;
+			rt = inst.r_format.rt;
+			sel = inst.r_format.re & 0x7;
+
+			switch (rd) {
+			case MIPS_HWR_CC:	/* Read count register */
+				arch->gprs[rt] =
+					(long)(int)kvm_mips_read_count(vcpu);
+				break;
+			default:
+				trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR,
+					      KVM_TRACE_HWR(rd, sel), 0);
+				goto unknown;
+			};
+
+			trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR,
+				      KVM_TRACE_HWR(rd, sel), arch->gprs[rt]);
+
+			er = update_pc(vcpu, cause);
+			break;
+		default:
+			goto unknown;
+		};
+		break;
+unknown:
+
+	default:
+		kvm_err("GPSI exception not supported (%p/%#x)\n",
+				opc, inst.word);
+		kvm_arch_vcpu_dump_regs(vcpu);
+		er = EMULATE_FAIL;
+		break;
+	}
+
+	return er;
+}
+
+static enum emulation_result kvm_trap_vz_handle_gsfc(u32 cause, u32 *opc,
+						     struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er = EMULATE_DONE;
+	struct kvm_vcpu_arch *arch = &vcpu->arch;
+	union mips_instruction inst;
+	int err;
+
+	/*
+	 *  Fetch the instruction.
+	 */
+	if (cause & CAUSEF_BD)
+		opc += 1;
+	err = kvm_get_badinstr(opc, vcpu, &inst.word);
+	if (err)
+		return EMULATE_FAIL;
+
+	/* complete MTC0 on behalf of guest and advance EPC */
+	if (inst.c0r_format.opcode == cop0_op &&
+	    inst.c0r_format.rs == mtc_op &&
+	    inst.c0r_format.z == 0) {
+		int rt = inst.c0r_format.rt;
+		int rd = inst.c0r_format.rd;
+		int sel = inst.c0r_format.sel;
+		unsigned int val = arch->gprs[rt];
+		unsigned int old_val, change;
+
+		trace_kvm_hwr(vcpu, KVM_TRACE_MTC0, KVM_TRACE_COP0(rd, sel),
+			      val);
+
+		if ((rd == MIPS_CP0_STATUS) && (sel == 0)) {
+			/* FR bit should read as zero if no FPU */
+			if (!kvm_mips_guest_has_fpu(&vcpu->arch))
+				val &= ~(ST0_CU1 | ST0_FR);
+
+			/*
+			 * Also don't allow FR to be set if host doesn't support
+			 * it.
+			 */
+			if (!(boot_cpu_data.fpu_id & MIPS_FPIR_F64))
+				val &= ~ST0_FR;
+
+			old_val = read_gc0_status();
+			change = val ^ old_val;
+
+			if (change & ST0_FR) {
+				/*
+				 * FPU and Vector register state is made
+				 * UNPREDICTABLE by a change of FR, so don't
+				 * even bother saving it.
+				 */
+				kvm_drop_fpu(vcpu);
+			}
+
+			/*
+			 * If MSA state is already live, it is undefined how it
+			 * interacts with FR=0 FPU state, and we don't want to
+			 * hit reserved instruction exceptions trying to save
+			 * the MSA state later when CU=1 && FR=1, so play it
+			 * safe and save it first.
+			 */
+			if (change & ST0_CU1 && !(val & ST0_FR) &&
+			    vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
+				kvm_lose_fpu(vcpu);
+
+			write_gc0_status(val);
+		} else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) {
+			u32 old_cause = read_gc0_cause();
+			u32 change = old_cause ^ val;
+
+			/* DC bit enabling/disabling timer? */
+			if (change & CAUSEF_DC) {
+				if (val & CAUSEF_DC)
+					kvm_mips_count_disable_cause(vcpu);
+				else
+					kvm_mips_count_enable_cause(vcpu);
+			}
+
+			/* Only certain bits are RW to the guest */
+			change &= (CAUSEF_DC | CAUSEF_IV | CAUSEF_WP |
+				   CAUSEF_IP0 | CAUSEF_IP1);
+
+			/* WP can only be cleared */
+			change &= ~CAUSEF_WP | old_cause;
+
+			write_gc0_cause(old_cause ^ change);
+		} else if ((rd == MIPS_CP0_STATUS) && (sel == 1)) { /* IntCtl */
+			write_gc0_intctl(val);
+		} else if ((rd == MIPS_CP0_CONFIG) && (sel == 5)) {
+			old_val = read_gc0_config5();
+			change = val ^ old_val;
+			/* Handle changes in FPU/MSA modes */
+			preempt_disable();
+
+			/*
+			 * Propagate FRE changes immediately if the FPU
+			 * context is already loaded.
+			 */
+			if (change & MIPS_CONF5_FRE &&
+			    vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)
+				change_c0_config5(MIPS_CONF5_FRE, val);
+
+			preempt_enable();
+
+			val = old_val ^
+				(change & kvm_vz_config5_guest_wrmask(vcpu));
+			write_gc0_config5(val);
+		} else {
+			kvm_err("Handle GSFC, unsupported field change @ %p: %#x\n",
+			    opc, inst.word);
+			er = EMULATE_FAIL;
+		}
+
+		if (er != EMULATE_FAIL)
+			er = update_pc(vcpu, cause);
+	} else {
+		kvm_err("Handle GSFC, unrecognized instruction @ %p: %#x\n",
+			opc, inst.word);
+		er = EMULATE_FAIL;
+	}
+
+	return er;
+}
+
+static enum emulation_result kvm_trap_vz_handle_hc(u32 cause, u32 *opc,
+						   struct kvm_vcpu *vcpu)
+{
+	union mips_instruction hypcall;
+	enum emulation_result result;
+	unsigned long saved_pc;
+
+	if (cause & CAUSEF_BD)
+		opc++;
+	if (kvm_get_badinstr(opc, vcpu, &hypcall.word))
+		return EMULATE_FAIL;
+
+	/*
+	 * Update the guest PC up front, remembering the old value so that a
+	 * failed emulation can put it back.
+	 */
+	saved_pc = vcpu->arch.pc;
+	result = update_pc(vcpu, cause);
+	if (result == EMULATE_FAIL)
+		return EMULATE_FAIL;
+
+	result = kvm_mips_emul_hypcall(vcpu, hypcall);
+	if (result != EMULATE_FAIL)
+		return result;
+
+	/* Emulation failed: roll the PC back */
+	vcpu->arch.pc = saved_pc;
+	return EMULATE_FAIL;
+}
+
+static enum emulation_result kvm_trap_vz_no_handler_guest_exit(u32 gexccode,
+							u32 cause,
+							u32 *opc,
+							struct kvm_vcpu *vcpu)
+{
+	u32 inst = 0;
+
+	/*
+	 * Fetch the instruction; inst is pre-zeroed so a failed fetch cannot
+	 * make the message below print an uninitialised value.
+	 */
+	if (cause & CAUSEF_BD)
+		opc += 1;
+	kvm_get_badinstr(opc, vcpu, &inst);
+
+	kvm_err("Guest Exception Code: %d not yet handled @ PC: %p, inst: 0x%08x  Status: %#x\n",
+		gexccode, opc, inst, read_gc0_status());
+
+	return EMULATE_FAIL;
+}
+
+static int kvm_trap_vz_handle_guest_exit(struct kvm_vcpu *vcpu)
+{
+	u32 *opc = (u32 *) vcpu->arch.pc;
+	u32 cause = vcpu->arch.host_cp0_cause;
+	enum emulation_result er = EMULATE_DONE;
+	u32 gexccode = (vcpu->arch.host_cp0_guestctl0 &
+			MIPS_GCTL0_GEXC) >> MIPS_GCTL0_GEXC_SHIFT;
+	int ret = RESUME_GUEST;
+
+	trace_kvm_exit(vcpu, KVM_TRACE_EXIT_GEXCCODE_BASE + gexccode);
+	switch (gexccode) {
+	case MIPS_GCTL0_GEXC_GPSI:
+		++vcpu->stat.vz_gpsi_exits;
+		er = kvm_trap_vz_handle_gpsi(cause, opc, vcpu);
+		break;
+	case MIPS_GCTL0_GEXC_GSFC:
+		++vcpu->stat.vz_gsfc_exits;
+		er = kvm_trap_vz_handle_gsfc(cause, opc, vcpu);
+		break;
+	case MIPS_GCTL0_GEXC_HC:
+		++vcpu->stat.vz_hc_exits;
+		er = kvm_trap_vz_handle_hc(cause, opc, vcpu);
+		break;
+	case MIPS_GCTL0_GEXC_GRR:
+		++vcpu->stat.vz_grr_exits;
+		er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc,
+						       vcpu);
+		break;
+	case MIPS_GCTL0_GEXC_GVA:
+		++vcpu->stat.vz_gva_exits;
+		er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc,
+						       vcpu);
+		break;
+	case MIPS_GCTL0_GEXC_GHFC:
+		++vcpu->stat.vz_ghfc_exits;
+		er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc,
+						       vcpu);
+		break;
+	case MIPS_GCTL0_GEXC_GPA:
+		++vcpu->stat.vz_gpa_exits;
+		er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc,
+						       vcpu);
+		break;
+	default:
+		++vcpu->stat.vz_resvd_exits;
+		er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc,
+						       vcpu);
+		break;
+
+	}
+
+	if (er == EMULATE_DONE) {
+		ret = RESUME_GUEST;
+	} else if (er == EMULATE_HYPERCALL) {
+		ret = kvm_mips_handle_hypcall(vcpu);
+	} else {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		ret = RESUME_HOST;
+	}
+	return ret;
+}
+
+/**
+ * kvm_trap_vz_handle_cop_unusable() - Guest used unusable coprocessor.
+ * @vcpu:	Virtual CPU context.
+ *
+ * Give the guest the FPU when it faults on coprocessor 1; any other case is
+ * an internal error. Returns RESUME_GUEST or RESUME_HOST accordingly.
+ */
+static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+	u32 cause = vcpu->arch.host_cp0_cause;
+	enum emulation_result er = EMULATE_FAIL;
+	int ret = RESUME_GUEST;
+
+	if (((cause & CAUSEF_CE) >> CAUSEB_CE) == 1) {
+		/*
+		 * If guest FPU not present, the FPU operation should have been
+		 * treated as a reserved instruction!
+		 * If FPU already in use, we shouldn't get this at all.
+		 */
+		if (WARN_ON(!kvm_mips_guest_has_fpu(&vcpu->arch) ||
+			    vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)) {
+			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+			return RESUME_HOST;
+		}
+
+		kvm_own_fpu(vcpu);
+		er = EMULATE_DONE;
+	}
+	/* other coprocessors not handled */
+
+	switch (er) {
+	case EMULATE_DONE:
+		ret = RESUME_GUEST;
+		break;
+
+	case EMULATE_FAIL:
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		ret = RESUME_HOST;
+		break;
+
+	default:
+		BUG();
+	}
+	return ret;
+}
+
+/**
+ * kvm_trap_vz_handle_msa_disabled() - Guest used MSA while disabled in root.
+ * @vcpu:	Virtual CPU context.
+ *
+ * Handle when the guest attempts to use MSA when it is disabled in the root
+ * context.
+ */
+static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+
+	/*
+	 * MSA is handed over only if it is exposed to the guest, the guest is
+	 * not running with CU1=1 and FR=0, Config5.MSAEn is set and the MSA
+	 * context is not already in use. Otherwise the operation should have
+	 * been a reserved instruction (or not trapped at all), so treat it as
+	 * an internal error.
+	 */
+	if (kvm_mips_guest_has_msa(&vcpu->arch) &&
+	    (read_gc0_status() & (ST0_CU1 | ST0_FR)) != ST0_CU1 &&
+	    (read_gc0_config5() & MIPS_CONF5_MSAEN) &&
+	    !(vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)) {
+		kvm_own_msa(vcpu);
+		return RESUME_GUEST;
+	}
+
+	run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+	return RESUME_HOST;
+}
+
+static int kvm_trap_vz_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+	u32 *opc = (u32 *) vcpu->arch.pc;
+	u32 cause = vcpu->arch.host_cp0_cause;
+	ulong badvaddr = vcpu->arch.host_cp0_badvaddr;
+	union mips_instruction inst;
+	enum emulation_result er;
+
+	/*
+	 * Zero: resume the guest. Non-zero: emulate the load as MMIO below.
+	 */
+	if (!kvm_mips_handle_vz_root_tlb_fault(badvaddr, vcpu, false))
+		return RESUME_GUEST;
+
+	/* A code fetch fault doesn't count as an MMIO */
+	if (kvm_is_ifetch_fault(&vcpu->arch)) {
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return RESUME_HOST;
+	}
+
+	/* Fetch the instruction */
+	if (cause & CAUSEF_BD)
+		opc += 1;
+	if (kvm_get_badinstr(opc, vcpu, &inst.word)) {
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return RESUME_HOST;
+	}
+
+	/* Treat as MMIO */
+	er = kvm_mips_emulate_load(inst, cause, run, vcpu);
+	switch (er) {
+	case EMULATE_DONE:
+		return RESUME_GUEST;
+	case EMULATE_DO_MMIO:
+		run->exit_reason = KVM_EXIT_MMIO;
+		return RESUME_HOST;
+	case EMULATE_FAIL:
+		kvm_err("Guest Emulate Load from MMIO space failed: PC: %p, BadVaddr: %#lx\n",
+			opc, badvaddr);
+		/* fall through */
+	default:
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return RESUME_HOST;
+	}
+}
+
+static int kvm_trap_vz_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+	u32 *opc = (u32 *) vcpu->arch.pc;
+	u32 cause = vcpu->arch.host_cp0_cause;
+	ulong gpa = vcpu->arch.host_cp0_badvaddr;
+	union mips_instruction inst;
+	enum emulation_result er;
+
+	/* Just try the access again if we couldn't do the translation */
+	if (kvm_vz_badvaddr_to_gpa(vcpu, gpa, &gpa))
+		return RESUME_GUEST;
+	vcpu->arch.host_cp0_badvaddr = gpa;
+
+	/*
+	 * A zero return resumes the guest straight away; a non-zero one makes
+	 * the access be emulated as an MMIO store below.
+	 */
+	if (!kvm_mips_handle_vz_root_tlb_fault(gpa, vcpu, true))
+		return RESUME_GUEST;
+
+	/* Fetch the instruction */
+	if (cause & CAUSEF_BD)
+		opc += 1;
+	if (kvm_get_badinstr(opc, vcpu, &inst.word)) {
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return RESUME_HOST;
+	}
+
+	/* Treat as MMIO */
+	er = kvm_mips_emulate_store(inst, cause, run, vcpu);
+	switch (er) {
+	case EMULATE_DONE:
+		return RESUME_GUEST;
+	case EMULATE_DO_MMIO:
+		run->exit_reason = KVM_EXIT_MMIO;
+		return RESUME_HOST;
+	case EMULATE_FAIL:
+		kvm_err("Guest Emulate Store to MMIO space failed: PC: %p, BadVaddr: %#lx\n",
+			opc, gpa);
+		/* fall through */
+	default:
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return RESUME_HOST;
+	}
+}
+
+static u64 kvm_vz_get_one_regs[] = {
+	KVM_REG_MIPS_CP0_INDEX,
+	KVM_REG_MIPS_CP0_ENTRYLO0,
+	KVM_REG_MIPS_CP0_ENTRYLO1,
+	KVM_REG_MIPS_CP0_CONTEXT,
+	KVM_REG_MIPS_CP0_PAGEMASK,
+	KVM_REG_MIPS_CP0_PAGEGRAIN,
+	KVM_REG_MIPS_CP0_WIRED,
+	KVM_REG_MIPS_CP0_HWRENA,
+	KVM_REG_MIPS_CP0_BADVADDR,
+	KVM_REG_MIPS_CP0_COUNT,
+	KVM_REG_MIPS_CP0_ENTRYHI,
+	KVM_REG_MIPS_CP0_COMPARE,
+	KVM_REG_MIPS_CP0_STATUS,
+	KVM_REG_MIPS_CP0_INTCTL,
+	KVM_REG_MIPS_CP0_CAUSE,
+	KVM_REG_MIPS_CP0_EPC,
+	KVM_REG_MIPS_CP0_PRID,
+	KVM_REG_MIPS_CP0_EBASE,
+	KVM_REG_MIPS_CP0_CONFIG,
+	KVM_REG_MIPS_CP0_CONFIG1,
+	KVM_REG_MIPS_CP0_CONFIG2,
+	KVM_REG_MIPS_CP0_CONFIG3,
+	KVM_REG_MIPS_CP0_CONFIG4,
+	KVM_REG_MIPS_CP0_CONFIG5,
+#ifdef CONFIG_64BIT
+	KVM_REG_MIPS_CP0_XCONTEXT,
+#endif
+	KVM_REG_MIPS_CP0_ERROREPC,
+
+	KVM_REG_MIPS_COUNT_CTL,
+	KVM_REG_MIPS_COUNT_RESUME,
+	KVM_REG_MIPS_COUNT_HZ,
+};
+
+static u64 kvm_vz_get_one_regs_kscratch[] = {
+	KVM_REG_MIPS_CP0_KSCRATCH1,
+	KVM_REG_MIPS_CP0_KSCRATCH2,
+	KVM_REG_MIPS_CP0_KSCRATCH3,
+	KVM_REG_MIPS_CP0_KSCRATCH4,
+	KVM_REG_MIPS_CP0_KSCRATCH5,
+	KVM_REG_MIPS_CP0_KSCRATCH6,
+};
+
+static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu)
+{
+	/* Start from the registers that are always listed */
+	unsigned long count = ARRAY_SIZE(kvm_vz_get_one_regs);
+
+	/* Then add UserLocal if present, plus one per KScratch mask bit */
+	count += cpu_guest_has_userlocal ? 1 : 0;
+	count += __arch_hweight8(cpu_data[0].guest.kscratch_mask);
+
+	return count;
+}
+
+static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
+{
+	u64 index;
+	unsigned int i;
+
+	if (copy_to_user(indices, kvm_vz_get_one_regs,
+			 sizeof(kvm_vz_get_one_regs)))
+		return -EFAULT;
+	indices += ARRAY_SIZE(kvm_vz_get_one_regs);
+	/* Optional IDs are appended only when the guest has them */
+	if (cpu_guest_has_userlocal) {
+		index = KVM_REG_MIPS_CP0_USERLOCAL;
+		if (copy_to_user(indices, &index, sizeof(index)))
+			return -EFAULT;
+		++indices;
+	}
+	for (i = 0; i < ARRAY_SIZE(kvm_vz_get_one_regs_kscratch); ++i) {
+		if (!cpu_guest_has_kscr(i + 2))
+			continue;
+
+		if (copy_to_user(indices, &kvm_vz_get_one_regs_kscratch[i],
+				 sizeof(kvm_vz_get_one_regs_kscratch[i])))
+			return -EFAULT;
+		++indices;
+	}
+
+	return 0;
+}
+
+/*
+ * Convert a native EntryLo value into the 64-bit layout exposed by the KVM
+ * API. On 32-bit kernels the RI/XI bits are moved up by 32 bit positions;
+ * on other kernels the value is passed through unchanged.
+ */
+static inline s64 entrylo_kvm_to_user(unsigned long v)
+{
+	s64 rixi, val = v;
+
+	if (BITS_PER_LONG != 32)
+		return val;
+
+	/* Lift RI/XI out of the low word into the top of the 64-bit value */
+	rixi = MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI;
+	return (val & ~rixi) | ((val & rixi) << 32);
+}
+
+/*
+ * Convert a 64-bit EntryLo value from the KVM API into the native layout.
+ * On 32-bit kernels the RI/XI bits are moved down by 32 bit positions; on
+ * other kernels the value is passed through unchanged.
+ */
+static inline unsigned long entrylo_user_to_kvm(s64 v)
+{
+	unsigned long rixi, val = v;
+
+	if (BITS_PER_LONG != 32)
+		return val;
+
+	/* Drop RI/XI from the top of the 64-bit value into the native word */
+	rixi = MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI;
+	return (val & ~rixi) | ((v >> 32) & rixi);
+}
+
+/*
+ * Read the value of one guest register, selected by reg->id, into *v.
+ *
+ * Most CP0 registers are read with the read_gc0_*() accessors. PRId is read
+ * from vcpu->arch.cop0, Count via kvm_mips_read_count(), and the COUNT_*
+ * pseudo-registers from vcpu->arch.
+ *
+ * Returns 0 on success, or -EINVAL if reg->id is not handled here or names a
+ * register the guest does not have (UserLocal, Config1-5, KScratch).
+ */
+static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu,
+			      const struct kvm_one_reg *reg,
+			      s64 *v)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	unsigned int idx;
+
+	switch (reg->id) {
+	case KVM_REG_MIPS_CP0_INDEX:
+		*v = (long)read_gc0_index();
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYLO0:
+		*v = entrylo_kvm_to_user(read_gc0_entrylo0());
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYLO1:
+		*v = entrylo_kvm_to_user(read_gc0_entrylo1());
+		break;
+	case KVM_REG_MIPS_CP0_CONTEXT:
+		*v = (long)read_gc0_context();
+		break;
+	case KVM_REG_MIPS_CP0_USERLOCAL:
+		if (!cpu_guest_has_userlocal)
+			return -EINVAL;
+		*v = read_gc0_userlocal();
+		break;
+	case KVM_REG_MIPS_CP0_PAGEMASK:
+		*v = (long)read_gc0_pagemask();
+		break;
+	case KVM_REG_MIPS_CP0_PAGEGRAIN:
+		*v = (long)read_gc0_pagegrain();
+		break;
+	case KVM_REG_MIPS_CP0_WIRED:
+		*v = (long)read_gc0_wired();
+		break;
+	case KVM_REG_MIPS_CP0_HWRENA:
+		*v = (long)read_gc0_hwrena();
+		break;
+	case KVM_REG_MIPS_CP0_BADVADDR:
+		*v = (long)read_gc0_badvaddr();
+		break;
+	case KVM_REG_MIPS_CP0_COUNT:
+		*v = kvm_mips_read_count(vcpu);
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYHI:
+		*v = (long)read_gc0_entryhi();
+		break;
+	case KVM_REG_MIPS_CP0_COMPARE:
+		*v = (long)read_gc0_compare();
+		break;
+	case KVM_REG_MIPS_CP0_STATUS:
+		*v = (long)read_gc0_status();
+		break;
+	case KVM_REG_MIPS_CP0_INTCTL:
+		*v = read_gc0_intctl();
+		break;
+	case KVM_REG_MIPS_CP0_CAUSE:
+		*v = (long)read_gc0_cause();
+		break;
+	case KVM_REG_MIPS_CP0_EPC:
+		*v = (long)read_gc0_epc();
+		break;
+	case KVM_REG_MIPS_CP0_PRID:
+		*v = (long)kvm_read_c0_guest_prid(cop0);
+		break;
+	case KVM_REG_MIPS_CP0_EBASE:
+		*v = kvm_vz_read_gc0_ebase();
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG:
+		*v = read_gc0_config();
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG1:
+		if (!cpu_guest_has_conf1)
+			return -EINVAL;
+		*v = read_gc0_config1();
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG2:
+		if (!cpu_guest_has_conf2)
+			return -EINVAL;
+		*v = read_gc0_config2();
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG3:
+		if (!cpu_guest_has_conf3)
+			return -EINVAL;
+		*v = read_gc0_config3();
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG4:
+		if (!cpu_guest_has_conf4)
+			return -EINVAL;
+		*v = read_gc0_config4();
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG5:
+		if (!cpu_guest_has_conf5)
+			return -EINVAL;
+		*v = read_gc0_config5();
+		break;
+#ifdef CONFIG_64BIT
+	case KVM_REG_MIPS_CP0_XCONTEXT:
+		*v = read_gc0_xcontext();
+		break;
+#endif
+	case KVM_REG_MIPS_CP0_ERROREPC:
+		*v = (long)read_gc0_errorepc();
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+		/* KSCRATCH1..6 IDs correspond to KScratch numbers 2..7 */
+		idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+		if (!cpu_guest_has_kscr(idx))
+			return -EINVAL;
+		switch (idx) {
+		case 2:
+			*v = (long)read_gc0_kscratch1();
+			break;
+		case 3:
+			*v = (long)read_gc0_kscratch2();
+			break;
+		case 4:
+			*v = (long)read_gc0_kscratch3();
+			break;
+		case 5:
+			*v = (long)read_gc0_kscratch4();
+			break;
+		case 6:
+			*v = (long)read_gc0_kscratch5();
+			break;
+		case 7:
+			*v = (long)read_gc0_kscratch6();
+			break;
+		}
+		break;
+	case KVM_REG_MIPS_COUNT_CTL:
+		*v = vcpu->arch.count_ctl;
+		break;
+	case KVM_REG_MIPS_COUNT_RESUME:
+		*v = ktime_to_ns(vcpu->arch.count_resume);
+		break;
+	case KVM_REG_MIPS_COUNT_HZ:
+		*v = vcpu->arch.count_hz;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Write @v to one guest register, selected by reg->id.
+ *
+ * Returns 0 on success, -EINVAL for an unhandled ID or for a UserLocal /
+ * KScratch register the guest does not have, or the result of the matching
+ * kvm_mips_set_count_*() helper for the COUNT_* pseudo-registers.
+ *
+ * Writes to a Config1-5 register the guest does not have are ignored and
+ * still return 0 (kvm_vz_get_one_reg() returns -EINVAL for the same IDs).
+ * Config register writes only alter the bits permitted by the corresponding
+ * kvm_vz_config*_user_wrmask().
+ */
+static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu,
+			      const struct kvm_one_reg *reg,
+			      s64 v)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	unsigned int idx;
+	int ret = 0;
+	unsigned int cur, change;
+
+	switch (reg->id) {
+	case KVM_REG_MIPS_CP0_INDEX:
+		write_gc0_index(v);
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYLO0:
+		write_gc0_entrylo0(entrylo_user_to_kvm(v));
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYLO1:
+		write_gc0_entrylo1(entrylo_user_to_kvm(v));
+		break;
+	case KVM_REG_MIPS_CP0_CONTEXT:
+		write_gc0_context(v);
+		break;
+	case KVM_REG_MIPS_CP0_USERLOCAL:
+		if (!cpu_guest_has_userlocal)
+			return -EINVAL;
+		write_gc0_userlocal(v);
+		break;
+	case KVM_REG_MIPS_CP0_PAGEMASK:
+		write_gc0_pagemask(v);
+		break;
+	case KVM_REG_MIPS_CP0_PAGEGRAIN:
+		write_gc0_pagegrain(v);
+		break;
+	case KVM_REG_MIPS_CP0_WIRED:
+		change_gc0_wired(MIPSR6_WIRED_WIRED, v);
+		break;
+	case KVM_REG_MIPS_CP0_HWRENA:
+		write_gc0_hwrena(v);
+		break;
+	case KVM_REG_MIPS_CP0_BADVADDR:
+		write_gc0_badvaddr(v);
+		break;
+	case KVM_REG_MIPS_CP0_COUNT:
+		kvm_mips_write_count(vcpu, v);
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYHI:
+		write_gc0_entryhi(v);
+		break;
+	case KVM_REG_MIPS_CP0_COMPARE:
+		kvm_mips_write_compare(vcpu, v, false);
+		break;
+	case KVM_REG_MIPS_CP0_STATUS:
+		write_gc0_status(v);
+		break;
+	case KVM_REG_MIPS_CP0_INTCTL:
+		write_gc0_intctl(v);
+		break;
+	case KVM_REG_MIPS_CP0_CAUSE:
+		/*
+		 * If the timer is stopped or started (DC bit) it must look
+		 * atomic with changes to the timer interrupt pending bit (TI).
+		 * A timer interrupt should not happen in between.
+		 */
+		if ((read_gc0_cause() ^ v) & CAUSEF_DC) {
+			if (v & CAUSEF_DC) {
+				/* disable timer first */
+				kvm_mips_count_disable_cause(vcpu);
+				change_gc0_cause((u32)~CAUSEF_DC, v);
+			} else {
+				/* enable timer last */
+				change_gc0_cause((u32)~CAUSEF_DC, v);
+				kvm_mips_count_enable_cause(vcpu);
+			}
+		} else {
+			write_gc0_cause(v);
+		}
+		break;
+	case KVM_REG_MIPS_CP0_EPC:
+		write_gc0_epc(v);
+		break;
+	case KVM_REG_MIPS_CP0_PRID:
+		kvm_write_c0_guest_prid(cop0, v);
+		break;
+	case KVM_REG_MIPS_CP0_EBASE:
+		kvm_vz_write_gc0_ebase(v);
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG:
+		/* Flip only the bits that differ and are user-writable */
+		cur = read_gc0_config();
+		change = (cur ^ v) & kvm_vz_config_user_wrmask(vcpu);
+		if (change) {
+			v = cur ^ change;
+			write_gc0_config(v);
+		}
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG1:
+		if (!cpu_guest_has_conf1)
+			break;
+		cur = read_gc0_config1();
+		change = (cur ^ v) & kvm_vz_config1_user_wrmask(vcpu);
+		if (change) {
+			v = cur ^ change;
+			write_gc0_config1(v);
+		}
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG2:
+		if (!cpu_guest_has_conf2)
+			break;
+		cur = read_gc0_config2();
+		change = (cur ^ v) & kvm_vz_config2_user_wrmask(vcpu);
+		if (change) {
+			v = cur ^ change;
+			write_gc0_config2(v);
+		}
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG3:
+		if (!cpu_guest_has_conf3)
+			break;
+		cur = read_gc0_config3();
+		change = (cur ^ v) & kvm_vz_config3_user_wrmask(vcpu);
+		if (change) {
+			v = cur ^ change;
+			write_gc0_config3(v);
+		}
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG4:
+		if (!cpu_guest_has_conf4)
+			break;
+		cur = read_gc0_config4();
+		change = (cur ^ v) & kvm_vz_config4_user_wrmask(vcpu);
+		if (change) {
+			v = cur ^ change;
+			write_gc0_config4(v);
+		}
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG5:
+		if (!cpu_guest_has_conf5)
+			break;
+		cur = read_gc0_config5();
+		change = (cur ^ v) & kvm_vz_config5_user_wrmask(vcpu);
+		if (change) {
+			v = cur ^ change;
+			write_gc0_config5(v);
+		}
+		break;
+#ifdef CONFIG_64BIT
+	case KVM_REG_MIPS_CP0_XCONTEXT:
+		write_gc0_xcontext(v);
+		break;
+#endif
+	case KVM_REG_MIPS_CP0_ERROREPC:
+		write_gc0_errorepc(v);
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+		/* KSCRATCH1..6 IDs correspond to KScratch numbers 2..7 */
+		idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+		if (!cpu_guest_has_kscr(idx))
+			return -EINVAL;
+		switch (idx) {
+		case 2:
+			write_gc0_kscratch1(v);
+			break;
+		case 3:
+			write_gc0_kscratch2(v);
+			break;
+		case 4:
+			write_gc0_kscratch3(v);
+			break;
+		case 5:
+			write_gc0_kscratch4(v);
+			break;
+		case 6:
+			write_gc0_kscratch5(v);
+			break;
+		case 7:
+			write_gc0_kscratch6(v);
+			break;
+		}
+		break;
+	case KVM_REG_MIPS_COUNT_CTL:
+		ret = kvm_mips_set_count_ctl(vcpu, v);
+		break;
+	case KVM_REG_MIPS_COUNT_RESUME:
+		ret = kvm_mips_set_count_resume(vcpu, v);
+		break;
+	case KVM_REG_MIPS_COUNT_HZ:
+		ret = kvm_mips_set_count_hz(vcpu, v);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return ret;
+}
+
+#define guestid_cache(cpu)	(cpu_data[cpu].guestid_cache)
+/*
+ * Advance guestid_cache(@cpu) to the next GuestID.
+ *
+ * When the incremented value has no bits set within GUESTID_MASK, a new
+ * GuestID cycle is started: the icache is flushed if it is virtually tagged,
+ * the value is reset to GUESTID_FIRST_VERSION if it became zero, GuestID 0 is
+ * skipped (reserved for root), and the kvm_vz_local_flush_*() helpers are
+ * called for the root and guest TLBs.
+ *
+ * @vcpu is not used by this function; the caller reads the new GuestID back
+ * from guestid_cache(@cpu).
+ */
+static void kvm_vz_get_new_guestid(unsigned long cpu, struct kvm_vcpu *vcpu)
+{
+	unsigned long guestid = guestid_cache(cpu);
+
+	if (!(++guestid & GUESTID_MASK)) {
+		if (cpu_has_vtag_icache)
+			flush_icache_all();
+
+		if (!guestid)		/* fix version if needed */
+			guestid = GUESTID_FIRST_VERSION;
+
+		++guestid;		/* guestid 0 reserved for root */
+
+		/* start new guestid cycle */
+		kvm_vz_local_flush_roottlb_all_guests();
+		kvm_vz_local_flush_guesttlb_all();
+	}
+
+	guestid_cache(cpu) = guestid;
+}
+
+/*
+ * Handle pending VCPU requests (currently only KVM_REQ_TLB_FLUSH).
+ *
+ * Returns 1 if the guest TLB may be clobbered as a result, 0 otherwise.
+ * @cpu is not used.
+ */
+static int kvm_vz_check_requests(struct kvm_vcpu *vcpu, int cpu)
+{
+	int n;
+
+	if (!vcpu->requests)
+		return 0;
+
+	if (!kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+		return 0;
+
+	/*
+	 * For Root ASID Dealias (RAD) we don't do anything here, but we still
+	 * need the request to ensure we recheck asid_flush_mask. We can still
+	 * return 0 as only the root TLB will be affected by a root ASID flush.
+	 */
+	if (!cpu_has_guestid)
+		return 0;
+
+	/* Drop all GuestIDs for this VCPU */
+	for_each_possible_cpu(n)
+		vcpu->arch.vzguestid[n] = 0;
+
+	/* This will clobber guest TLB contents too */
+	return 1;
+}
+
+/*
+ * Save the guest's wired TLB entries into vcpu->arch.wired_tlb.
+ *
+ * The number of entries comes from the Wired field of the guest Wired
+ * register. The backing array is grown with krealloc(GFP_ATOMIC) when needed;
+ * if that fails, a warning is issued and only as many entries as the existing
+ * array holds are saved. Array slots that were in use after the previous save
+ * but are no longer wired are overwritten with invalid entries.
+ */
+static void kvm_vz_vcpu_save_wired(struct kvm_vcpu *vcpu)
+{
+	unsigned int wired = read_gc0_wired();
+	struct kvm_mips_tlb *tlbs;
+	int i;
+
+	/* Expand the wired TLB array if necessary */
+	wired &= MIPSR6_WIRED_WIRED;
+	if (wired > vcpu->arch.wired_tlb_limit) {
+		tlbs = krealloc(vcpu->arch.wired_tlb, wired *
+				sizeof(*vcpu->arch.wired_tlb), GFP_ATOMIC);
+		if (WARN_ON(!tlbs)) {
+			/* Save whatever we can */
+			wired = vcpu->arch.wired_tlb_limit;
+		} else {
+			vcpu->arch.wired_tlb = tlbs;
+			vcpu->arch.wired_tlb_limit = wired;
+		}
+	}
+
+	if (wired)
+		/* Save wired entries from the guest TLB */
+		kvm_vz_save_guesttlb(vcpu->arch.wired_tlb, 0, wired);
+	/* Invalidate any dropped entries since last time */
+	for (i = wired; i < vcpu->arch.wired_tlb_used; ++i) {
+		vcpu->arch.wired_tlb[i].tlb_hi = UNIQUE_GUEST_ENTRYHI(i);
+		vcpu->arch.wired_tlb[i].tlb_lo[0] = 0;
+		vcpu->arch.wired_tlb[i].tlb_lo[1] = 0;
+		vcpu->arch.wired_tlb[i].tlb_mask = 0;
+	}
+	vcpu->arch.wired_tlb_used = wired;
+}
+
+/*
+ * Reload the saved wired entries (vcpu->arch.wired_tlb, wired_tlb_used of
+ * them) into the guest TLB. Does nothing if no array has been allocated.
+ */
+static void kvm_vz_vcpu_load_wired(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mips_tlb *saved = vcpu->arch.wired_tlb;
+
+	if (!saved)
+		return;
+
+	kvm_vz_load_guesttlb(saved, 0, vcpu->arch.wired_tlb_used);
+}
+
+/*
+ * Prepare TLB identification state before the VCPU enters the guest on @cpu.
+ *
+ * Records @cpu in vcpu->arch.last_exec_cpu. With GuestID support, a new
+ * GuestID is allocated when the VCPU last executed on another CPU or its
+ * stored GuestID version is out of date, and the GuestID is written to
+ * GuestCtl1. Without GuestID support, the guest TLB is flushed when the VCPU
+ * last executed on another CPU or another VCPU executed on this CPU, and a
+ * new root ASID is allocated for the GPA mm when required.
+ */
+static void kvm_vz_vcpu_load_tlb(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct mm_struct *gpa_mm = &kvm->arch.gpa_mm;
+	bool migrated;
+
+	/*
+	 * Are we entering guest context on a different CPU to last time?
+	 * If so, the VCPU's guest TLB state on this CPU may be stale.
+	 */
+	migrated = (vcpu->arch.last_exec_cpu != cpu);
+	vcpu->arch.last_exec_cpu = cpu;
+
+	/*
+	 * A vcpu's GuestID is set in GuestCtl1.ID when the vcpu is loaded and
+	 * remains set until another vcpu is loaded in.  As a rule GuestRID
+	 * remains zeroed when in root context unless the kernel is busy
+	 * manipulating guest tlb entries.
+	 */
+	if (cpu_has_guestid) {
+		/*
+		 * Check if our GuestID is of an older version and thus invalid.
+		 *
+		 * We also discard the stored GuestID if we've executed on
+		 * another CPU, as the guest mappings may have changed without
+		 * hypervisor knowledge.
+		 */
+		if (migrated ||
+		    (vcpu->arch.vzguestid[cpu] ^ guestid_cache(cpu)) &
+					GUESTID_VERSION_MASK) {
+			kvm_vz_get_new_guestid(cpu, vcpu);
+			vcpu->arch.vzguestid[cpu] = guestid_cache(cpu);
+			trace_kvm_guestid_change(vcpu,
+						 vcpu->arch.vzguestid[cpu]);
+		}
+
+		/* Restore GuestID */
+		change_c0_guestctl1(GUESTID_MASK, vcpu->arch.vzguestid[cpu]);
+	} else {
+		/*
+		 * The Guest TLB only stores a single guest's TLB state, so
+		 * flush it if another VCPU has executed on this CPU.
+		 *
+		 * We also flush if we've executed on another CPU, as the guest
+		 * mappings may have changed without hypervisor knowledge.
+		 */
+		if (migrated || last_exec_vcpu[cpu] != vcpu)
+			kvm_vz_local_flush_guesttlb_all();
+		last_exec_vcpu[cpu] = vcpu;
+
+		/*
+		 * Root ASID dealiases guest GPA mappings in the root TLB.
+		 * Allocate new root ASID if needed.
+		 */
+		if (cpumask_test_and_clear_cpu(cpu, &kvm->arch.asid_flush_mask)
+		    || (cpu_context(cpu, gpa_mm) ^ asid_cache(cpu)) &
+						asid_version_mask(cpu))
+			get_new_mmu_context(gpa_mm, cpu);
+	}
+}
+
+/*
+ * vcpu_load callback: restore guest CP0 state from vcpu->arch.cop0 on @cpu.
+ *
+ * Wired and the timer state are always restored. The TLB state and wired
+ * guest TLB entries are only reloaded when the current task has PF_VCPU set.
+ * The remaining registers are restored only if the VCPU was last scheduled
+ * on another CPU or was not the last VCPU loaded on this CPU.
+ *
+ * Always returns 0.
+ */
+static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	bool migrated, all;
+
+	/*
+	 * Have we migrated to a different CPU?
+	 * If so, any old guest TLB state may be stale.
+	 */
+	migrated = (vcpu->arch.last_sched_cpu != cpu);
+
+	/*
+	 * Was this the last VCPU to run on this CPU?
+	 * If not, any old guest state from this VCPU will have been clobbered.
+	 */
+	all = migrated || (last_vcpu[cpu] != vcpu);
+	last_vcpu[cpu] = vcpu;
+
+	/*
+	 * Restore CP0_Wired unconditionally as we clear it after use, and
+	 * restore wired guest TLB entries (while in guest context).
+	 */
+	kvm_restore_gc0_wired(cop0);
+	if (current->flags & PF_VCPU) {
+		tlbw_use_hazard();
+		kvm_vz_vcpu_load_tlb(vcpu, cpu);
+		kvm_vz_vcpu_load_wired(vcpu);
+	}
+
+	/*
+	 * Restore timer state regardless, as e.g. Cause.TI can change over time
+	 * if left unmaintained.
+	 */
+	kvm_vz_restore_timer(vcpu);
+
+	/* Don't bother restoring registers multiple times unless necessary */
+	if (!all)
+		return 0;
+
+	/*
+	 * Restore config registers first, as some implementations restrict
+	 * writes to other registers when the corresponding feature bits aren't
+	 * set. For example Status.CU1 cannot be set unless Config1.FP is set.
+	 */
+	kvm_restore_gc0_config(cop0);
+	if (cpu_guest_has_conf1)
+		kvm_restore_gc0_config1(cop0);
+	if (cpu_guest_has_conf2)
+		kvm_restore_gc0_config2(cop0);
+	if (cpu_guest_has_conf3)
+		kvm_restore_gc0_config3(cop0);
+	if (cpu_guest_has_conf4)
+		kvm_restore_gc0_config4(cop0);
+	if (cpu_guest_has_conf5)
+		kvm_restore_gc0_config5(cop0);
+	if (cpu_guest_has_conf6)
+		kvm_restore_gc0_config6(cop0);
+	if (cpu_guest_has_conf7)
+		kvm_restore_gc0_config7(cop0);
+
+	kvm_restore_gc0_index(cop0);
+	kvm_restore_gc0_entrylo0(cop0);
+	kvm_restore_gc0_entrylo1(cop0);
+	kvm_restore_gc0_context(cop0);
+#ifdef CONFIG_64BIT
+	kvm_restore_gc0_xcontext(cop0);
+#endif
+	kvm_restore_gc0_pagemask(cop0);
+	kvm_restore_gc0_pagegrain(cop0);
+	kvm_restore_gc0_hwrena(cop0);
+	kvm_restore_gc0_badvaddr(cop0);
+	kvm_restore_gc0_entryhi(cop0);
+	kvm_restore_gc0_status(cop0);
+	kvm_restore_gc0_intctl(cop0);
+	kvm_restore_gc0_epc(cop0);
+	kvm_vz_write_gc0_ebase(kvm_read_sw_gc0_ebase(cop0));
+	if (cpu_guest_has_userlocal)
+		kvm_restore_gc0_userlocal(cop0);
+
+	kvm_restore_gc0_errorepc(cop0);
+
+	/* restore KScratch registers if enabled in guest */
+	if (cpu_guest_has_conf4) {
+		if (cpu_guest_has_kscr(2))
+			kvm_restore_gc0_kscratch1(cop0);
+		if (cpu_guest_has_kscr(3))
+			kvm_restore_gc0_kscratch2(cop0);
+		if (cpu_guest_has_kscr(4))
+			kvm_restore_gc0_kscratch3(cop0);
+		if (cpu_guest_has_kscr(5))
+			kvm_restore_gc0_kscratch4(cop0);
+		if (cpu_guest_has_kscr(6))
+			kvm_restore_gc0_kscratch5(cop0);
+		if (cpu_guest_has_kscr(7))
+			kvm_restore_gc0_kscratch6(cop0);
+	}
+
+	/* restore Root.GuestCtl2 from unused Guest guestctl2 register */
+	if (cpu_has_guestctl2)
+		write_c0_guestctl2(
+			cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL]);
+
+	return 0;
+}
+
+/*
+ * vcpu_put callback: save guest CP0 state into vcpu->arch.cop0.
+ *
+ * Wired guest TLB entries are saved first when the current task has PF_VCPU
+ * set. After the guest Wired register is saved, its Wired field is cleared.
+ * The timer state is saved via kvm_vz_save_timer(), and Root.GuestCtl2 is
+ * saved when present. @cpu is not used.
+ *
+ * Always returns 0.
+ */
+static int kvm_vz_vcpu_put(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+
+	if (current->flags & PF_VCPU)
+		kvm_vz_vcpu_save_wired(vcpu);
+
+	kvm_lose_fpu(vcpu);
+
+	kvm_save_gc0_index(cop0);
+	kvm_save_gc0_entrylo0(cop0);
+	kvm_save_gc0_entrylo1(cop0);
+	kvm_save_gc0_context(cop0);
+#ifdef CONFIG_64BIT
+	kvm_save_gc0_xcontext(cop0);
+#endif
+	kvm_save_gc0_pagemask(cop0);
+	kvm_save_gc0_pagegrain(cop0);
+	kvm_save_gc0_wired(cop0);
+	/* allow wired TLB entries to be overwritten */
+	clear_gc0_wired(MIPSR6_WIRED_WIRED);
+	kvm_save_gc0_hwrena(cop0);
+	kvm_save_gc0_badvaddr(cop0);
+	kvm_save_gc0_entryhi(cop0);
+	kvm_save_gc0_status(cop0);
+	kvm_save_gc0_intctl(cop0);
+	kvm_save_gc0_epc(cop0);
+	kvm_write_sw_gc0_ebase(cop0, kvm_vz_read_gc0_ebase());
+	if (cpu_guest_has_userlocal)
+		kvm_save_gc0_userlocal(cop0);
+
+	/* only save implemented config registers */
+	kvm_save_gc0_config(cop0);
+	if (cpu_guest_has_conf1)
+		kvm_save_gc0_config1(cop0);
+	if (cpu_guest_has_conf2)
+		kvm_save_gc0_config2(cop0);
+	if (cpu_guest_has_conf3)
+		kvm_save_gc0_config3(cop0);
+	if (cpu_guest_has_conf4)
+		kvm_save_gc0_config4(cop0);
+	if (cpu_guest_has_conf5)
+		kvm_save_gc0_config5(cop0);
+	if (cpu_guest_has_conf6)
+		kvm_save_gc0_config6(cop0);
+	if (cpu_guest_has_conf7)
+		kvm_save_gc0_config7(cop0);
+
+	kvm_save_gc0_errorepc(cop0);
+
+	/* save KScratch registers if enabled in guest */
+	if (cpu_guest_has_conf4) {
+		if (cpu_guest_has_kscr(2))
+			kvm_save_gc0_kscratch1(cop0);
+		if (cpu_guest_has_kscr(3))
+			kvm_save_gc0_kscratch2(cop0);
+		if (cpu_guest_has_kscr(4))
+			kvm_save_gc0_kscratch3(cop0);
+		if (cpu_guest_has_kscr(5))
+			kvm_save_gc0_kscratch4(cop0);
+		if (cpu_guest_has_kscr(6))
+			kvm_save_gc0_kscratch5(cop0);
+		if (cpu_guest_has_kscr(7))
+			kvm_save_gc0_kscratch6(cop0);
+	}
+
+	kvm_vz_save_timer(vcpu);
+
+	/* save Root.GuestCtl2 in unused Guest guestctl2 register */
+	if (cpu_has_guestctl2)
+		cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] =
+			read_c0_guestctl2();
+
+	return 0;
+}
+
+/**
+ * kvm_vz_resize_guest_vtlb() - Attempt to resize guest VTLB.
+ * @size:	Number of guest VTLB entries (0 < @size <= root VTLB entries).
+ *
+ * Attempt to resize the guest VTLB by writing guest Config registers. This is
+ * necessary for cores with a shared root/guest TLB to avoid overlap with wired
+ * entries in the root VTLB.
+ *
+ * @size - 1 is split between the guest Config1 TLB size field and, when guest
+ * Config4 is present, its VTLBSizeExt or MMUSizeExt field. On MIPSr6 the guest
+ * Wired register is also rewritten with a limit derived from the root Wired
+ * limit.
+ *
+ * Returns:	The resulting guest VTLB size.
+ */
+static unsigned int kvm_vz_resize_guest_vtlb(unsigned int size)
+{
+	unsigned int config4 = 0, ret = 0, limit;
+
+	/* Write MMUSize - 1 into guest Config registers */
+	if (cpu_guest_has_conf1)
+		change_gc0_config1(MIPS_CONF1_TLBS,
+				   (size - 1) << MIPS_CONF1_TLBS_SHIFT);
+	if (cpu_guest_has_conf4) {
+		config4 = read_gc0_config4();
+		if (cpu_has_mips_r6 || (config4 & MIPS_CONF4_MMUEXTDEF) ==
+		    MIPS_CONF4_MMUEXTDEF_VTLBSIZEEXT) {
+			config4 &= ~MIPS_CONF4_VTLBSIZEEXT;
+			config4 |= ((size - 1) >> MIPS_CONF1_TLBS_SIZE) <<
+				MIPS_CONF4_VTLBSIZEEXT_SHIFT;
+		} else if ((config4 & MIPS_CONF4_MMUEXTDEF) ==
+			   MIPS_CONF4_MMUEXTDEF_MMUSIZEEXT) {
+			config4 &= ~MIPS_CONF4_MMUSIZEEXT;
+			config4 |= ((size - 1) >> MIPS_CONF1_TLBS_SIZE) <<
+				MIPS_CONF4_MMUSIZEEXT_SHIFT;
+		}
+		write_gc0_config4(config4);
+	}
+
+	/*
+	 * Set Guest.Wired.Limit = 0 (no limit up to Guest.MMUSize-1), unless it
+	 * would exceed Root.Wired.Limit (clearing Guest.Wired.Wired so write
+	 * not dropped)
+	 */
+	if (cpu_has_mips_r6) {
+		limit = (read_c0_wired() & MIPSR6_WIRED_LIMIT) >>
+						MIPSR6_WIRED_LIMIT_SHIFT;
+		if (size - 1 <= limit)
+			limit = 0;
+		write_gc0_wired(limit << MIPSR6_WIRED_LIMIT_SHIFT);
+	}
+
+	/* Read back MMUSize - 1 */
+	back_to_back_c0_hazard();
+	if (cpu_guest_has_conf1)
+		ret = (read_gc0_config1() & MIPS_CONF1_TLBS) >>
+						MIPS_CONF1_TLBS_SHIFT;
+	/*
+	 * NOTE(review): Config1 is re-read from the register above, but the
+	 * Config4 extension bits below come from the local value that was
+	 * written, not from a fresh read_gc0_config4() - confirm this is
+	 * intended.
+	 */
+	if (config4) {
+		if (cpu_has_mips_r6 || (config4 & MIPS_CONF4_MMUEXTDEF) ==
+		    MIPS_CONF4_MMUEXTDEF_VTLBSIZEEXT)
+			ret |= ((config4 & MIPS_CONF4_VTLBSIZEEXT) >>
+				MIPS_CONF4_VTLBSIZEEXT_SHIFT) <<
+				MIPS_CONF1_TLBS_SIZE;
+		else if ((config4 & MIPS_CONF4_MMUEXTDEF) ==
+			 MIPS_CONF4_MMUEXTDEF_MMUSIZEEXT)
+			ret |= ((config4 & MIPS_CONF4_MMUSIZEEXT) >>
+				MIPS_CONF4_MMUSIZEEXT_SHIFT) <<
+				MIPS_CONF1_TLBS_SIZE;
+	}
+	return ret + 1;
+}
+
+/*
+ * hardware_enable callback: set up VZ state on the current CPU.
+ *
+ * Sizes the guest VTLB (first to the maximum so the guest TLB can be flushed,
+ * then reduced to leave room for root wired entries plus 2), records the
+ * resulting size in current_cpu_data.guest.tlbsize, programs GuestCtl0 (and
+ * GuestCtl0Ext when present), initialises GuestID state when supported, and
+ * clears bits in GuestCtl2 when present.
+ *
+ * Returns 0 on success, or -EINVAL (after a warning) if the guest VTLB size
+ * differs from the one already recorded in kvm_vz_guest_vtlb_size.
+ *
+ * NOTE(review): the 0x3f << 10 mask passed to clear_c0_guestctl2() has no
+ * named constant here; presumably it covers the GuestCtl2 virtual interrupt
+ * pending field - confirm.
+ */
+static int kvm_vz_hardware_enable(void)
+{
+	unsigned int mmu_size, guest_mmu_size, ftlb_size;
+
+	/*
+	 * ImgTec cores tend to use a shared root/guest TLB. To avoid overlap of
+	 * root wired and guest entries, the guest TLB may need resizing.
+	 */
+	mmu_size = current_cpu_data.tlbsizevtlb;
+	ftlb_size = current_cpu_data.tlbsize - mmu_size;
+
+	/* Try switching to maximum guest VTLB size for flush */
+	guest_mmu_size = kvm_vz_resize_guest_vtlb(mmu_size);
+	current_cpu_data.guest.tlbsize = guest_mmu_size + ftlb_size;
+	kvm_vz_local_flush_guesttlb_all();
+
+	/*
+	 * Reduce to make space for root wired entries and at least 2 root
+	 * non-wired entries. This does assume that long-term wired entries
+	 * won't be added later.
+	 */
+	guest_mmu_size = mmu_size - num_wired_entries() - 2;
+	guest_mmu_size = kvm_vz_resize_guest_vtlb(guest_mmu_size);
+	current_cpu_data.guest.tlbsize = guest_mmu_size + ftlb_size;
+
+	/*
+	 * Write the VTLB size, but if another CPU has already written, check it
+	 * matches or we won't provide a consistent view to the guest. If this
+	 * ever happens it suggests an asymmetric number of wired entries.
+	 */
+	if (cmpxchg(&kvm_vz_guest_vtlb_size, 0, guest_mmu_size) &&
+	    WARN(guest_mmu_size != kvm_vz_guest_vtlb_size,
+		 "Available guest VTLB size mismatch"))
+		return -EINVAL;
+
+	/*
+	 * Enable virtualization features granting guest direct control of
+	 * certain features:
+	 * CP0=1:	Guest coprocessor 0 context.
+	 * AT=Guest:	Guest MMU.
+	 * CG=1:	Hit (virtual address) CACHE operations (optional).
+	 * CF=1:	Guest Config registers.
+	 * CGI=1:	Indexed flush CACHE operations (optional).
+	 */
+	write_c0_guestctl0(MIPS_GCTL0_CP0 |
+			   (MIPS_GCTL0_AT_GUEST << MIPS_GCTL0_AT_SHIFT) |
+			   MIPS_GCTL0_CG | MIPS_GCTL0_CF);
+	if (cpu_has_guestctl0ext)
+		set_c0_guestctl0ext(MIPS_GCTL0EXT_CGI);
+
+	if (cpu_has_guestid) {
+		write_c0_guestctl1(0);
+		kvm_vz_local_flush_roottlb_all_guests();
+
+		GUESTID_MASK = current_cpu_data.guestid_mask;
+		GUESTID_FIRST_VERSION = GUESTID_MASK + 1;
+		GUESTID_VERSION_MASK = ~GUESTID_MASK;
+
+		current_cpu_data.guestid_cache = GUESTID_FIRST_VERSION;
+	}
+
+	/* clear any pending injected virtual guest interrupts */
+	if (cpu_has_guestctl2)
+		clear_c0_guestctl2(0x3f << 10);
+
+	return 0;
+}
+
+/*
+ * hardware_disable callback: flush the guest TLB on this CPU and, when
+ * GuestID is supported, zero GuestCtl1 and flush guest entries from the root
+ * TLB.
+ */
+static void kvm_vz_hardware_disable(void)
+{
+	kvm_vz_local_flush_guesttlb_all();
+
+	if (!cpu_has_guestid)
+		return;
+
+	write_c0_guestctl1(0);
+	kvm_vz_local_flush_roottlb_all_guests();
+}
+
+/*
+ * check_extension callback: report VZ-specific capabilities.
+ *
+ * Returns 1 for KVM_CAP_MIPS_VZ, 2 for KVM_CAP_MIPS_64BIT on 64-bit kernels,
+ * and 0 for anything else. @kvm is not used.
+ */
+static int kvm_vz_check_extension(struct kvm *kvm, long ext)
+{
+	switch (ext) {
+	case KVM_CAP_MIPS_VZ:
+		/* we wouldn't be here unless cpu_has_vz */
+		return 1;
+#ifdef CONFIG_64BIT
+	case KVM_CAP_MIPS_64BIT:
+		/* We support 64-bit registers/operations and addresses */
+		return 2;
+#endif
+	default:
+		return 0;
+	}
+}
+
+/*
+ * vcpu_init callback: zero the VCPU's stored GuestID for every possible CPU
+ * (the same value kvm_vz_check_requests() writes to drop GuestIDs).
+ *
+ * Always returns 0.
+ */
+static int kvm_vz_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		vcpu->arch.vzguestid[cpu] = 0;
+
+	return 0;
+}
+
+/*
+ * vcpu_uninit callback: remove every reference to this VCPU from the per-CPU
+ * last_vcpu[] and last_exec_vcpu[] tables.
+ */
+static void kvm_vz_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	/*
+	 * A freed VCPU may be reused as another VCPU; a stale pointer left in
+	 * either table would then wrongly match the new VCPU.
+	 */
+	for_each_possible_cpu(i) {
+		if (last_exec_vcpu[i] == vcpu)
+			last_exec_vcpu[i] = NULL;
+		if (last_vcpu[i] == vcpu)
+			last_vcpu[i] = NULL;
+	}
+}
+
+/*
+ * vcpu_setup callback: initialise the VCPU's timer and the saved guest CP0
+ * state in vcpu->arch.cop0.
+ *
+ * The guest timer frequency defaults to 100 MHz, or mips_hpt_frequency when
+ * that is non-zero and no greater than NSEC_PER_SEC. Config registers are
+ * seeded with kvm_save_gc0_config*() and then adjusted in the saved copy;
+ * when ConfigN+1 is present for the guest, MIPS_CONF_M is set in the saved
+ * ConfigN. The PC is set to CKSEG1ADDR(0x1fc00000).
+ *
+ * Always returns 0.
+ */
+static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	unsigned long count_hz = 100*1000*1000; /* default to 100 MHz */
+
+	/*
+	 * Start off the timer at the same frequency as the host timer, but the
+	 * soft timer doesn't handle frequencies greater than 1GHz yet.
+	 */
+	if (mips_hpt_frequency && mips_hpt_frequency <= NSEC_PER_SEC)
+		count_hz = mips_hpt_frequency;
+	kvm_mips_init_count(vcpu, count_hz);
+
+	/*
+	 * Initialize guest register state to valid architectural reset state.
+	 */
+
+	/* PageGrain */
+	if (cpu_has_mips_r6)
+		kvm_write_sw_gc0_pagegrain(cop0, PG_RIE | PG_XIE | PG_IEC);
+	/* Wired */
+	if (cpu_has_mips_r6)
+		kvm_write_sw_gc0_wired(cop0,
+				       read_gc0_wired() & MIPSR6_WIRED_LIMIT);
+	/* Status */
+	kvm_write_sw_gc0_status(cop0, ST0_BEV | ST0_ERL);
+	if (cpu_has_mips_r6)
+		kvm_change_sw_gc0_status(cop0, ST0_FR, read_gc0_status());
+	/* IntCtl */
+	kvm_write_sw_gc0_intctl(cop0, read_gc0_intctl() &
+				(INTCTLF_IPFDC | INTCTLF_IPPCI | INTCTLF_IPTI));
+	/* PRId */
+	kvm_write_sw_gc0_prid(cop0, boot_cpu_data.processor_id);
+	/* EBase */
+	kvm_write_sw_gc0_ebase(cop0, (s32)0x80000000 | vcpu->vcpu_id);
+	/* Config */
+	kvm_save_gc0_config(cop0);
+	/* architecturally writable (e.g. from guest) */
+	kvm_change_sw_gc0_config(cop0, CONF_CM_CMASK,
+				 _page_cachable_default >> _CACHE_SHIFT);
+	/* architecturally read only, but maybe writable from root */
+	kvm_change_sw_gc0_config(cop0, MIPS_CONF_MT, read_c0_config());
+	if (cpu_guest_has_conf1) {
+		kvm_set_sw_gc0_config(cop0, MIPS_CONF_M);
+		/* Config1 */
+		kvm_save_gc0_config1(cop0);
+		/* architecturally read only, but maybe writable from root */
+		kvm_clear_sw_gc0_config1(cop0, MIPS_CONF1_C2	|
+					       MIPS_CONF1_MD	|
+					       MIPS_CONF1_PC	|
+					       MIPS_CONF1_WR	|
+					       MIPS_CONF1_CA	|
+					       MIPS_CONF1_FP);
+	}
+	if (cpu_guest_has_conf2) {
+		kvm_set_sw_gc0_config1(cop0, MIPS_CONF_M);
+		/* Config2 */
+		kvm_save_gc0_config2(cop0);
+	}
+	if (cpu_guest_has_conf3) {
+		kvm_set_sw_gc0_config2(cop0, MIPS_CONF_M);
+		/* Config3 */
+		kvm_save_gc0_config3(cop0);
+		/* architecturally writable (e.g. from guest) */
+		kvm_clear_sw_gc0_config3(cop0, MIPS_CONF3_ISA_OE);
+		/* architecturally read only, but maybe writable from root */
+		kvm_clear_sw_gc0_config3(cop0, MIPS_CONF3_MSA	|
+					       MIPS_CONF3_BPG	|
+					       MIPS_CONF3_ULRI	|
+					       MIPS_CONF3_DSP	|
+					       MIPS_CONF3_CTXTC	|
+					       MIPS_CONF3_ITL	|
+					       MIPS_CONF3_LPA	|
+					       MIPS_CONF3_VEIC	|
+					       MIPS_CONF3_VINT	|
+					       MIPS_CONF3_SP	|
+					       MIPS_CONF3_CDMM	|
+					       MIPS_CONF3_MT	|
+					       MIPS_CONF3_SM	|
+					       MIPS_CONF3_TL);
+	}
+	if (cpu_guest_has_conf4) {
+		kvm_set_sw_gc0_config3(cop0, MIPS_CONF_M);
+		/* Config4 */
+		kvm_save_gc0_config4(cop0);
+	}
+	if (cpu_guest_has_conf5) {
+		kvm_set_sw_gc0_config4(cop0, MIPS_CONF_M);
+		/* Config5 */
+		kvm_save_gc0_config5(cop0);
+		/* architecturally writable (e.g. from guest) */
+		kvm_clear_sw_gc0_config5(cop0, MIPS_CONF5_K	|
+					       MIPS_CONF5_CV	|
+					       MIPS_CONF5_MSAEN	|
+					       MIPS_CONF5_UFE	|
+					       MIPS_CONF5_FRE	|
+					       MIPS_CONF5_SBRI	|
+					       MIPS_CONF5_UFR);
+		/* architecturally read only, but maybe writable from root */
+		kvm_clear_sw_gc0_config5(cop0, MIPS_CONF5_MRP);
+	}
+
+	/* start with no pending virtual guest interrupts */
+	if (cpu_has_guestctl2)
+		cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] = 0;
+
+	/* Put PC at reset vector */
+	vcpu->arch.pc = CKSEG1ADDR(0x1fc00000);
+
+	return 0;
+}
+
+/*
+ * flush_shadow_all callback: invalidate the guest mappings of the whole VM.
+ *
+ * With GuestID support the remote TLB flush request makes each VCPU drop its
+ * GuestIDs individually. Without it, every CPU is first marked in
+ * asid_flush_mask before the same request is issued.
+ */
+static void kvm_vz_flush_shadow_all(struct kvm *kvm)
+{
+	/*
+	 * Without GuestID, for each CPU there is a single GPA ASID used by all
+	 * VCPUs in the VM, so it doesn't make sense for the VCPUs to handle
+	 * invalidation of these ASIDs individually.
+	 *
+	 * Instead mark all CPUs as needing ASID invalidation in
+	 * asid_flush_mask, and just use kvm_flush_remote_tlbs(kvm) to kick any
+	 * running VCPUs so they check asid_flush_mask.
+	 */
+	if (!cpu_has_guestid)
+		cpumask_setall(&kvm->arch.asid_flush_mask);
+
+	/* With GuestID this flushes the GuestID for each VCPU individually */
+	kvm_flush_remote_tlbs(kvm);
+}
+
+/*
+ * flush_shadow_memslot callback: @slot is ignored and the whole VM is flushed
+ * via kvm_vz_flush_shadow_all().
+ */
+static void kvm_vz_flush_shadow_memslot(struct kvm *kvm,
+					const struct kvm_memory_slot *slot)
+{
+	kvm_vz_flush_shadow_all(kvm);
+}
+
+/*
+ * vcpu_reenter callback: process pending requests and refresh TLB state
+ * before going back into the guest on the current CPU.
+ *
+ * If kvm_vz_check_requests() reports that the guest TLB may be clobbered, the
+ * wired guest TLB entries are saved before kvm_vz_vcpu_load_tlb() and loaded
+ * again afterwards. @run is not used.
+ */
+static void kvm_vz_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	int cpu = smp_processor_id();
+	int preserve_guest_tlb;
+
+	preserve_guest_tlb = kvm_vz_check_requests(vcpu, cpu);
+
+	if (preserve_guest_tlb)
+		kvm_vz_vcpu_save_wired(vcpu);
+
+	kvm_vz_vcpu_load_tlb(vcpu, cpu);
+
+	if (preserve_guest_tlb)
+		kvm_vz_vcpu_load_wired(vcpu);
+}
+
+/*
+ * vcpu_run callback: deliver pending interrupts, process requests, load TLB
+ * state and wired guest TLB entries, then enter the guest through
+ * vcpu->arch.vcpu_run(). Wired guest TLB entries are saved again once that
+ * call returns.
+ *
+ * Returns the value returned by vcpu->arch.vcpu_run().
+ */
+static int kvm_vz_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	int cpu = smp_processor_id();
+	int r;
+
+	/* Check if we have any exceptions/interrupts pending */
+	kvm_mips_deliver_interrupts(vcpu, read_gc0_cause());
+
+	kvm_vz_check_requests(vcpu, cpu);
+	kvm_vz_vcpu_load_tlb(vcpu, cpu);
+	kvm_vz_vcpu_load_wired(vcpu);
+
+	r = vcpu->arch.vcpu_run(run, vcpu);
+
+	kvm_vz_vcpu_save_wired(vcpu);
+
+	return r;
+}
+
+/*
+ * Callback table for the VZ implementation, handed out by
+ * kvm_mips_emulation_init().
+ *
+ * NOTE(review): handle_tlb_mod is wired to the TLB store-miss handler rather
+ * than a dedicated TLB-modified handler - confirm this is intentional.
+ */
+static struct kvm_mips_callbacks kvm_vz_callbacks = {
+	.handle_cop_unusable = kvm_trap_vz_handle_cop_unusable,
+	.handle_tlb_mod = kvm_trap_vz_handle_tlb_st_miss,
+	.handle_tlb_ld_miss = kvm_trap_vz_handle_tlb_ld_miss,
+	.handle_tlb_st_miss = kvm_trap_vz_handle_tlb_st_miss,
+	.handle_addr_err_st = kvm_trap_vz_no_handler,
+	.handle_addr_err_ld = kvm_trap_vz_no_handler,
+	.handle_syscall = kvm_trap_vz_no_handler,
+	.handle_res_inst = kvm_trap_vz_no_handler,
+	.handle_break = kvm_trap_vz_no_handler,
+	.handle_msa_disabled = kvm_trap_vz_handle_msa_disabled,
+	.handle_guest_exit = kvm_trap_vz_handle_guest_exit,
+
+	.hardware_enable = kvm_vz_hardware_enable,
+	.hardware_disable = kvm_vz_hardware_disable,
+	.check_extension = kvm_vz_check_extension,
+	.vcpu_init = kvm_vz_vcpu_init,
+	.vcpu_uninit = kvm_vz_vcpu_uninit,
+	.vcpu_setup = kvm_vz_vcpu_setup,
+	.flush_shadow_all = kvm_vz_flush_shadow_all,
+	.flush_shadow_memslot = kvm_vz_flush_shadow_memslot,
+	.gva_to_gpa = kvm_vz_gva_to_gpa_cb,
+	.queue_timer_int = kvm_vz_queue_timer_int_cb,
+	.dequeue_timer_int = kvm_vz_dequeue_timer_int_cb,
+	.queue_io_int = kvm_vz_queue_io_int_cb,
+	.dequeue_io_int = kvm_vz_dequeue_io_int_cb,
+	.irq_deliver = kvm_vz_irq_deliver_cb,
+	.irq_clear = kvm_vz_irq_clear_cb,
+	.num_regs = kvm_vz_num_regs,
+	.copy_reg_indices = kvm_vz_copy_reg_indices,
+	.get_one_reg = kvm_vz_get_one_reg,
+	.set_one_reg = kvm_vz_set_one_reg,
+	.vcpu_load = kvm_vz_vcpu_load,
+	.vcpu_put = kvm_vz_vcpu_put,
+	.vcpu_run = kvm_vz_vcpu_run,
+	.vcpu_reenter = kvm_vz_vcpu_reenter,
+};
+
+/*
+ * Install the VZ callback table.
+ *
+ * On success *install_callbacks is pointed at kvm_vz_callbacks and 0 is
+ * returned. Returns -ENODEV if the CPU lacks VZ, or (after a warning) if
+ * pgd_reg is -1.
+ */
+int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks)
+{
+	if (!cpu_has_vz)
+		return -ENODEV;
+
+	/*
+	 * VZ requires at least 2 KScratch registers, so it should have been
+	 * possible to allocate pgd_reg.
+	 */
+	if (WARN(pgd_reg == -1,
+		 "pgd_reg not allocated even though cpu_has_vz\n"))
+		return -ENODEV;
+
+	pr_info("Starting KVM with MIPS VZ extensions\n");
+
+	*install_callbacks = &kvm_vz_callbacks;
+	return 0;
+}
-- 
cgit v1.2.3-55-g7522


From edc89260d474fd044e77b7039bd2a697dddcac6c Mon Sep 17 00:00:00 2001
From: James Hogan
Date: Tue, 14 Mar 2017 10:15:33 +0000
Subject: KVM: MIPS/VZ: Support guest CP0_BadInstr[P]

Add support for VZ guest CP0_BadInstr and CP0_BadInstrP registers, as
found on most VZ capable cores. These guest registers need context
switching, and exposing via the KVM ioctl API when they are present.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Cc: linux-doc@vger.kernel.org
---
 Documentation/virtual/kvm/api.txt |  2 ++
 arch/mips/include/asm/kvm_host.h  |  4 ++++
 arch/mips/kvm/vz.c                | 46 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index d2827864827f..5ef4fa1de7d4 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2079,6 +2079,8 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_WIRED        | 32
   MIPS  | KVM_REG_MIPS_CP0_HWRENA       | 32
   MIPS  | KVM_REG_MIPS_CP0_BADVADDR     | 64
+  MIPS  | KVM_REG_MIPS_CP0_BADINSTR     | 32
+  MIPS  | KVM_REG_MIPS_CP0_BADINSTRP    | 32
   MIPS  | KVM_REG_MIPS_CP0_COUNT        | 32
   MIPS  | KVM_REG_MIPS_CP0_ENTRYHI      | 64
   MIPS  | KVM_REG_MIPS_CP0_COMPARE      | 32
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index bc3f9dedaac8..3a9ca3326315 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -40,6 +40,8 @@
 #define KVM_REG_MIPS_CP0_WIRED		MIPS_CP0_32(6, 0)
 #define KVM_REG_MIPS_CP0_HWRENA		MIPS_CP0_32(7, 0)
 #define KVM_REG_MIPS_CP0_BADVADDR	MIPS_CP0_64(8, 0)
+#define KVM_REG_MIPS_CP0_BADINSTR	MIPS_CP0_32(8, 1)
+#define KVM_REG_MIPS_CP0_BADINSTRP	MIPS_CP0_32(8, 2)
 #define KVM_REG_MIPS_CP0_COUNT		MIPS_CP0_32(9, 0)
 #define KVM_REG_MIPS_CP0_ENTRYHI	MIPS_CP0_64(10, 0)
 #define KVM_REG_MIPS_CP0_COMPARE	MIPS_CP0_32(11, 0)
@@ -669,6 +671,8 @@ __BUILD_KVM_RW_HW(pagegrain,      32, MIPS_CP0_TLB_PG_MASK,  1)
 __BUILD_KVM_RW_HW(wired,          32, MIPS_CP0_TLB_WIRED,    0)
 __BUILD_KVM_RW_HW(hwrena,         32, MIPS_CP0_HWRENA,       0)
 __BUILD_KVM_RW_HW(badvaddr,       l,  MIPS_CP0_BAD_VADDR,    0)
+__BUILD_KVM_RW_HW(badinstr,       32, MIPS_CP0_BAD_VADDR,    1)
+__BUILD_KVM_RW_HW(badinstrp,      32, MIPS_CP0_BAD_VADDR,    2)
 __BUILD_KVM_RW_SW(count,          32, MIPS_CP0_COUNT,        0)
 __BUILD_KVM_RW_HW(entryhi,        l,  MIPS_CP0_TLB_HI,       0)
 __BUILD_KVM_RW_HW(compare,        32, MIPS_CP0_COMPARE,      0)
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index cfed234be1e3..ec909fcd08ce 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -1221,6 +1221,10 @@ static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu)
 	ret = ARRAY_SIZE(kvm_vz_get_one_regs);
 	if (cpu_guest_has_userlocal)
 		++ret;
+	if (cpu_guest_has_badinstr)
+		++ret;
+	if (cpu_guest_has_badinstrp)
+		++ret;
 	ret += __arch_hweight8(cpu_data[0].guest.kscratch_mask);
 
 	return ret;
@@ -1242,6 +1246,18 @@ static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
 			return -EFAULT;
 		++indices;
 	}
+	if (cpu_guest_has_badinstr) {
+		index = KVM_REG_MIPS_CP0_BADINSTR;
+		if (copy_to_user(indices, &index, sizeof(index)))
+			return -EFAULT;
+		++indices;
+	}
+	if (cpu_guest_has_badinstrp) {
+		index = KVM_REG_MIPS_CP0_BADINSTRP;
+		if (copy_to_user(indices, &index, sizeof(index)))
+			return -EFAULT;
+		++indices;
+	}
 	for (i = 0; i < 6; ++i) {
 		if (!cpu_guest_has_kscr(i + 2))
 			continue;
@@ -1327,6 +1343,16 @@ static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu,
 	case KVM_REG_MIPS_CP0_BADVADDR:
 		*v = (long)read_gc0_badvaddr();
 		break;
+	case KVM_REG_MIPS_CP0_BADINSTR:
+		if (!cpu_guest_has_badinstr)
+			return -EINVAL;
+		*v = read_gc0_badinstr();
+		break;
+	case KVM_REG_MIPS_CP0_BADINSTRP:
+		if (!cpu_guest_has_badinstrp)
+			return -EINVAL;
+		*v = read_gc0_badinstrp();
+		break;
 	case KVM_REG_MIPS_CP0_COUNT:
 		*v = kvm_mips_read_count(vcpu);
 		break;
@@ -1472,6 +1498,16 @@ static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu,
 	case KVM_REG_MIPS_CP0_BADVADDR:
 		write_gc0_badvaddr(v);
 		break;
+	case KVM_REG_MIPS_CP0_BADINSTR:
+		if (!cpu_guest_has_badinstr)
+			return -EINVAL;
+		write_gc0_badinstr(v);
+		break;
+	case KVM_REG_MIPS_CP0_BADINSTRP:
+		if (!cpu_guest_has_badinstrp)
+			return -EINVAL;
+		write_gc0_badinstrp(v);
+		break;
 	case KVM_REG_MIPS_CP0_COUNT:
 		kvm_mips_write_count(vcpu, v);
 		break;
@@ -1871,6 +1907,11 @@ static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 			kvm_restore_gc0_kscratch6(cop0);
 	}
 
+	if (cpu_guest_has_badinstr)
+		kvm_restore_gc0_badinstr(cop0);
+	if (cpu_guest_has_badinstrp)
+		kvm_restore_gc0_badinstrp(cop0);
+
 	/* restore Root.GuestCtl2 from unused Guest guestctl2 register */
 	if (cpu_has_guestctl2)
 		write_c0_guestctl2(
@@ -1945,6 +1986,11 @@ static int kvm_vz_vcpu_put(struct kvm_vcpu *vcpu, int cpu)
 			kvm_save_gc0_kscratch6(cop0);
 	}
 
+	if (cpu_guest_has_badinstr)
+		kvm_save_gc0_badinstr(cop0);
+	if (cpu_guest_has_badinstrp)
+		kvm_save_gc0_badinstrp(cop0);
+
 	kvm_vz_save_timer(vcpu);
 
 	/* save Root.GuestCtl2 in unused Guest guestctl2 register */
-- 
cgit v1.2.3-55-g7522


From dffe042fd8b2cc174966b929b379503af2c97933 Mon Sep 17 00:00:00 2001
From: James Hogan
Date: Tue, 14 Mar 2017 10:15:34 +0000
Subject: KVM: MIPS/VZ: Support guest CP0_[X]ContextConfig

Add support for VZ guest CP0_ContextConfig and CP0_XContextConfig
(MIPS64 only) registers, as found on P5600 and P6600 cores. These guest
registers need initialising, context switching, and exposing via the KVM
ioctl API when they are present.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Cc: linux-doc@vger.kernel.org
---
 Documentation/virtual/kvm/api.txt |  2 ++
 arch/mips/include/asm/kvm_host.h  |  4 +++
 arch/mips/kvm/vz.c                | 62 +++++++++++++++++++++++++++++++++++++--
 3 files changed, 66 insertions(+), 2 deletions(-)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 5ef4fa1de7d4..5f53bfdc0d84 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2073,7 +2073,9 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_ENTRYLO0     | 64
   MIPS  | KVM_REG_MIPS_CP0_ENTRYLO1     | 64
   MIPS  | KVM_REG_MIPS_CP0_CONTEXT      | 64
+  MIPS  | KVM_REG_MIPS_CP0_CONTEXTCONFIG| 32
   MIPS  | KVM_REG_MIPS_CP0_USERLOCAL    | 64
+  MIPS  | KVM_REG_MIPS_CP0_XCONTEXTCONFIG| 64
   MIPS  | KVM_REG_MIPS_CP0_PAGEMASK     | 32
   MIPS  | KVM_REG_MIPS_CP0_PAGEGRAIN    | 32
   MIPS  | KVM_REG_MIPS_CP0_WIRED        | 32
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 3a9ca3326315..5066d89f2227 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -34,7 +34,9 @@
 #define KVM_REG_MIPS_CP0_ENTRYLO0	MIPS_CP0_64(2, 0)
 #define KVM_REG_MIPS_CP0_ENTRYLO1	MIPS_CP0_64(3, 0)
 #define KVM_REG_MIPS_CP0_CONTEXT	MIPS_CP0_64(4, 0)
+#define KVM_REG_MIPS_CP0_CONTEXTCONFIG	MIPS_CP0_32(4, 1)
 #define KVM_REG_MIPS_CP0_USERLOCAL	MIPS_CP0_64(4, 2)
+#define KVM_REG_MIPS_CP0_XCONTEXTCONFIG	MIPS_CP0_64(4, 3)
 #define KVM_REG_MIPS_CP0_PAGEMASK	MIPS_CP0_32(5, 0)
 #define KVM_REG_MIPS_CP0_PAGEGRAIN	MIPS_CP0_32(5, 1)
 #define KVM_REG_MIPS_CP0_WIRED		MIPS_CP0_32(6, 0)
@@ -665,7 +667,9 @@ __BUILD_KVM_RW_HW(index,          32, MIPS_CP0_TLB_INDEX,    0)
 __BUILD_KVM_RW_HW(entrylo0,       l,  MIPS_CP0_TLB_LO0,      0)
 __BUILD_KVM_RW_HW(entrylo1,       l,  MIPS_CP0_TLB_LO1,      0)
 __BUILD_KVM_RW_HW(context,        l,  MIPS_CP0_TLB_CONTEXT,  0)
+__BUILD_KVM_RW_HW(contextconfig,  32, MIPS_CP0_TLB_CONTEXT,  1)
 __BUILD_KVM_RW_HW(userlocal,      l,  MIPS_CP0_TLB_CONTEXT,  2)
+__BUILD_KVM_RW_HW(xcontextconfig, l,  MIPS_CP0_TLB_CONTEXT,  3)
 __BUILD_KVM_RW_HW(pagemask,       l,  MIPS_CP0_TLB_PG_MASK,  0)
 __BUILD_KVM_RW_HW(pagegrain,      32, MIPS_CP0_TLB_PG_MASK,  1)
 __BUILD_KVM_RW_HW(wired,          32, MIPS_CP0_TLB_WIRED,    0)
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index ec909fcd08ce..97e7a788bf4a 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -131,7 +131,7 @@ static inline unsigned int kvm_vz_config5_guest_wrmask(struct kvm_vcpu *vcpu)
  * Config:	M, [MT]
  * Config1:	M, [MMUSize-1, C2, MD, PC, WR, CA], FP
  * Config2:	M
- * Config3:	M, MSAP, [BPG], ULRI, [DSP2P, DSPP, CTXTC, ITL, LPA, VEIC,
+ * Config3:	M, MSAP, [BPG], ULRI, [DSP2P, DSPP], CTXTC, [ITL, LPA, VEIC,
  *		VInt, SP, CDMM, MT, SM, TL]
  * Config4:	M, [VTLBSizeExt, MMUSizeExt]
  * Config5:	[MRP]
@@ -161,7 +161,7 @@ static inline unsigned int kvm_vz_config2_user_wrmask(struct kvm_vcpu *vcpu)
 static inline unsigned int kvm_vz_config3_user_wrmask(struct kvm_vcpu *vcpu)
 {
 	unsigned int mask = kvm_vz_config3_guest_wrmask(vcpu) | MIPS_CONF_M |
-		MIPS_CONF3_ULRI;
+		MIPS_CONF3_ULRI | MIPS_CONF3_CTXTC;
 
 	/* Permit MSA to be present if MSA is supported */
 	if (kvm_mips_guest_can_have_msa(&vcpu->arch))
@@ -1205,6 +1205,13 @@ static u64 kvm_vz_get_one_regs[] = {
 	KVM_REG_MIPS_COUNT_HZ,
 };
 
+static u64 kvm_vz_get_one_regs_contextconfig[] = {
+	KVM_REG_MIPS_CP0_CONTEXTCONFIG,
+#ifdef CONFIG_64BIT
+	KVM_REG_MIPS_CP0_XCONTEXTCONFIG,
+#endif
+};
+
 static u64 kvm_vz_get_one_regs_kscratch[] = {
 	KVM_REG_MIPS_CP0_KSCRATCH1,
 	KVM_REG_MIPS_CP0_KSCRATCH2,
@@ -1225,6 +1232,8 @@ static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu)
 		++ret;
 	if (cpu_guest_has_badinstrp)
 		++ret;
+	if (cpu_guest_has_contextconfig)
+		ret += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig);
 	ret += __arch_hweight8(cpu_data[0].guest.kscratch_mask);
 
 	return ret;
@@ -1258,6 +1267,12 @@ static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
 			return -EFAULT;
 		++indices;
 	}
+	if (cpu_guest_has_contextconfig) {
+		if (copy_to_user(indices, kvm_vz_get_one_regs_contextconfig,
+				 sizeof(kvm_vz_get_one_regs_contextconfig)))
+			return -EFAULT;
+		indices += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig);
+	}
 	for (i = 0; i < 6; ++i) {
 		if (!cpu_guest_has_kscr(i + 2))
 			continue;
@@ -1323,11 +1338,23 @@ static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu,
 	case KVM_REG_MIPS_CP0_CONTEXT:
 		*v = (long)read_gc0_context();
 		break;
+	case KVM_REG_MIPS_CP0_CONTEXTCONFIG:
+		if (!cpu_guest_has_contextconfig)
+			return -EINVAL;
+		*v = read_gc0_contextconfig();
+		break;
 	case KVM_REG_MIPS_CP0_USERLOCAL:
 		if (!cpu_guest_has_userlocal)
 			return -EINVAL;
 		*v = read_gc0_userlocal();
 		break;
+#ifdef CONFIG_64BIT
+	case KVM_REG_MIPS_CP0_XCONTEXTCONFIG:
+		if (!cpu_guest_has_contextconfig)
+			return -EINVAL;
+		*v = read_gc0_xcontextconfig();
+		break;
+#endif
 	case KVM_REG_MIPS_CP0_PAGEMASK:
 		*v = (long)read_gc0_pagemask();
 		break;
@@ -1478,11 +1505,23 @@ static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu,
 	case KVM_REG_MIPS_CP0_CONTEXT:
 		write_gc0_context(v);
 		break;
+	case KVM_REG_MIPS_CP0_CONTEXTCONFIG:
+		if (!cpu_guest_has_contextconfig)
+			return -EINVAL;
+		write_gc0_contextconfig(v);
+		break;
 	case KVM_REG_MIPS_CP0_USERLOCAL:
 		if (!cpu_guest_has_userlocal)
 			return -EINVAL;
 		write_gc0_userlocal(v);
 		break;
+#ifdef CONFIG_64BIT
+	case KVM_REG_MIPS_CP0_XCONTEXTCONFIG:
+		if (!cpu_guest_has_contextconfig)
+			return -EINVAL;
+		write_gc0_xcontextconfig(v);
+		break;
+#endif
 	case KVM_REG_MIPS_CP0_PAGEMASK:
 		write_gc0_pagemask(v);
 		break;
@@ -1874,8 +1913,12 @@ static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	kvm_restore_gc0_entrylo0(cop0);
 	kvm_restore_gc0_entrylo1(cop0);
 	kvm_restore_gc0_context(cop0);
+	if (cpu_guest_has_contextconfig)
+		kvm_restore_gc0_contextconfig(cop0);
 #ifdef CONFIG_64BIT
 	kvm_restore_gc0_xcontext(cop0);
+	if (cpu_guest_has_contextconfig)
+		kvm_restore_gc0_xcontextconfig(cop0);
 #endif
 	kvm_restore_gc0_pagemask(cop0);
 	kvm_restore_gc0_pagegrain(cop0);
@@ -1933,8 +1976,12 @@ static int kvm_vz_vcpu_put(struct kvm_vcpu *vcpu, int cpu)
 	kvm_save_gc0_entrylo0(cop0);
 	kvm_save_gc0_entrylo1(cop0);
 	kvm_save_gc0_context(cop0);
+	if (cpu_guest_has_contextconfig)
+		kvm_save_gc0_contextconfig(cop0);
 #ifdef CONFIG_64BIT
 	kvm_save_gc0_xcontext(cop0);
+	if (cpu_guest_has_contextconfig)
+		kvm_save_gc0_xcontextconfig(cop0);
 #endif
 	kvm_save_gc0_pagemask(cop0);
 	kvm_save_gc0_pagegrain(cop0);
@@ -2298,6 +2345,17 @@ static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu)
 		kvm_clear_sw_gc0_config5(cop0, MIPS_CONF5_MRP);
 	}
 
+	if (cpu_guest_has_contextconfig) {
+		/* ContextConfig */
+		kvm_write_sw_gc0_contextconfig(cop0, 0x007ffff0);
+#ifdef CONFIG_64BIT
+		/* XContextConfig */
+		/* bits (SEGBITS - 13 + 3):4 set */
+		kvm_write_sw_gc0_xcontextconfig(cop0,
+					((1ull << (cpu_vmbits - 13)) - 1) << 4);
+#endif
+	}
+
 	/* start with no pending virtual guest interrupts */
 	if (cpu_has_guestctl2)
 		cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] = 0;
-- 
cgit v1.2.3-55-g7522


From 4b7de028e66cc499ab85b5003af944225c8c87e9 Mon Sep 17 00:00:00 2001
From: James Hogan
Date: Tue, 14 Mar 2017 10:15:35 +0000
Subject: KVM: MIPS/VZ: Support guest segmentation control

Add support for VZ guest CP0_SegCtl0, CP0_SegCtl1, and CP0_SegCtl2
registers, as found on P5600 and P6600 cores. These guest registers need
initialising, context switching, and exposing via the KVM ioctl API when
they are present.

They also require the GVA -> GPA translation code for handling a GVA
root exception to be updated to interpret the segmentation registers and
decode the faulting instruction enough to detect EVA memory access
instructions.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Cc: linux-doc@vger.kernel.org
---
 Documentation/virtual/kvm/api.txt |   3 +
 arch/mips/include/asm/kvm_host.h  |   6 +
 arch/mips/kvm/vz.c                | 242 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 250 insertions(+), 1 deletion(-)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 5f53bfdc0d84..45194363a160 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2078,6 +2078,9 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_XCONTEXTCONFIG| 64
   MIPS  | KVM_REG_MIPS_CP0_PAGEMASK     | 32
   MIPS  | KVM_REG_MIPS_CP0_PAGEGRAIN    | 32
+  MIPS  | KVM_REG_MIPS_CP0_SEGCTL0      | 64
+  MIPS  | KVM_REG_MIPS_CP0_SEGCTL1      | 64
+  MIPS  | KVM_REG_MIPS_CP0_SEGCTL2      | 64
   MIPS  | KVM_REG_MIPS_CP0_WIRED        | 32
   MIPS  | KVM_REG_MIPS_CP0_HWRENA       | 32
   MIPS  | KVM_REG_MIPS_CP0_BADVADDR     | 64
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 5066d89f2227..b2129c031df7 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -39,6 +39,9 @@
 #define KVM_REG_MIPS_CP0_XCONTEXTCONFIG	MIPS_CP0_64(4, 3)
 #define KVM_REG_MIPS_CP0_PAGEMASK	MIPS_CP0_32(5, 0)
 #define KVM_REG_MIPS_CP0_PAGEGRAIN	MIPS_CP0_32(5, 1)
+#define KVM_REG_MIPS_CP0_SEGCTL0	MIPS_CP0_64(5, 2)
+#define KVM_REG_MIPS_CP0_SEGCTL1	MIPS_CP0_64(5, 3)
+#define KVM_REG_MIPS_CP0_SEGCTL2	MIPS_CP0_64(5, 4)
 #define KVM_REG_MIPS_CP0_WIRED		MIPS_CP0_32(6, 0)
 #define KVM_REG_MIPS_CP0_HWRENA		MIPS_CP0_32(7, 0)
 #define KVM_REG_MIPS_CP0_BADVADDR	MIPS_CP0_64(8, 0)
@@ -672,6 +675,9 @@ __BUILD_KVM_RW_HW(userlocal,      l,  MIPS_CP0_TLB_CONTEXT,  2)
 __BUILD_KVM_RW_HW(xcontextconfig, l,  MIPS_CP0_TLB_CONTEXT,  3)
 __BUILD_KVM_RW_HW(pagemask,       l,  MIPS_CP0_TLB_PG_MASK,  0)
 __BUILD_KVM_RW_HW(pagegrain,      32, MIPS_CP0_TLB_PG_MASK,  1)
+__BUILD_KVM_RW_HW(segctl0,        l,  MIPS_CP0_TLB_PG_MASK,  2)
+__BUILD_KVM_RW_HW(segctl1,        l,  MIPS_CP0_TLB_PG_MASK,  3)
+__BUILD_KVM_RW_HW(segctl2,        l,  MIPS_CP0_TLB_PG_MASK,  4)
 __BUILD_KVM_RW_HW(wired,          32, MIPS_CP0_TLB_WIRED,    0)
 __BUILD_KVM_RW_HW(hwrena,         32, MIPS_CP0_HWRENA,       0)
 __BUILD_KVM_RW_HW(badvaddr,       l,  MIPS_CP0_BAD_VADDR,    0)
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index 97e7a788bf4a..f32c1ab3f724 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -411,6 +411,117 @@ static void kvm_vz_save_timer(struct kvm_vcpu *vcpu)
 	kvm_write_sw_gc0_compare(cop0, compare);
 }
 
+/**
+ * is_eva_access() - Find whether an instruction is an EVA memory accessor.
+ * @inst:	32-bit instruction encoding.
+ *
+ * Finds whether @inst encodes an EVA memory access instruction, which would
+ * indicate that emulation of it should access the user mode address space
+ * instead of the kernel mode address space. This matters for MUSUK segments
+ * which are TLB mapped for user mode but unmapped for kernel mode.
+ *
+ * Returns:	Whether @inst encodes an EVA accessor instruction.
+ */
+static bool is_eva_access(union mips_instruction inst)
+{
+	if (inst.spec3_format.opcode != spec3_op)
+		return false;
+
+	switch (inst.spec3_format.func) {
+	case lwle_op:
+	case lwre_op:
+	case cachee_op:
+	case sbe_op:
+	case she_op:
+	case sce_op:
+	case swe_op:
+	case swle_op:
+	case swre_op:
+	case prefe_op:
+	case lbue_op:
+	case lhue_op:
+	case lbe_op:
+	case lhe_op:
+	case lle_op:
+	case lwe_op:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/**
+ * is_eva_am_mapped() - Find whether an access mode is mapped.
+ * @vcpu:	KVM VCPU state.
+ * @am:		3-bit encoded access mode.
+ * @eu:		Segment becomes unmapped and uncached when Status.ERL=1.
+ *
+ * Decode @am to find whether it encodes a mapped segment for the current VCPU
+ * state. Where necessary @eu and the actual instruction causing the fault are
+ * taken into account to make the decision.
+ *
+ * Returns:	Whether the VCPU faulted on a TLB mapped address.
+ */
+static bool is_eva_am_mapped(struct kvm_vcpu *vcpu, unsigned int am, bool eu)
+{
+	u32 am_lookup;
+	int err;
+
+	/*
+	 * Interpret access control mode. We assume address errors will already
+	 * have been caught by the guest, leaving us with:
+	 *      AM      UM  SM  KM  31..24 23..16
+	 * UK    0 000          Unm   0      0
+	 * MK    1 001          TLB   1
+	 * MSK   2 010      TLB TLB   1
+	 * MUSK  3 011  TLB TLB TLB   1
+	 * MUSUK 4 100  TLB TLB Unm   0      1
+	 * USK   5 101      Unm Unm   0      0
+	 * -     6 110                0      0
+	 * UUSK  7 111  Unm Unm Unm   0      0
+	 *
+	 * Shifting a magic value left by AM sets the sign bit if always TLB
+	 * mapped; if not, a further shift by 8 sets it if it depends on KM.
+	 */
+	am_lookup = 0x70080000 << am;
+	if ((s32)am_lookup < 0) {
+		/*
+		 * MK, MSK, MUSK
+		 * Always TLB mapped, unless SegCtl.EU && ERL
+		 */
+		if (!eu || !(read_gc0_status() & ST0_ERL))
+			return true;
+	} else {
+		am_lookup <<= 8;
+		if ((s32)am_lookup < 0) {
+			union mips_instruction inst;
+			unsigned int status;
+			u32 *opc;
+
+			/*
+			 * MUSUK
+			 * TLB mapped if not in kernel mode
+			 */
+			status = read_gc0_status();
+			if (!(status & (ST0_EXL | ST0_ERL)) &&
+			    (status & ST0_KSU))
+				return true;
+			/*
+			 * EVA access instructions in kernel
+			 * mode access user address space.
+			 */
+			opc = (u32 *)vcpu->arch.pc;
+			if (vcpu->arch.host_cp0_cause & CAUSEF_BD)
+				opc += 1;
+			err = kvm_get_badinstr(opc, vcpu, &inst.word);
+			if (!err && is_eva_access(inst))
+				return true;
+		}
+	}
+
+	return false;
+}
+
 /**
  * kvm_vz_gva_to_gpa() - Convert valid GVA to GPA.
  * @vcpu:	KVM VCPU state.
@@ -427,10 +538,58 @@ static int kvm_vz_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 			     unsigned long *gpa)
 {
 	u32 gva32 = gva;
+	unsigned long segctl;
 
 	if ((long)gva == (s32)gva32) {
 		/* Handle canonical 32-bit virtual address */
-		if ((s32)gva32 < (s32)0xc0000000) {
+		if (cpu_guest_has_segments) {
+			unsigned long mask, pa;
+
+			switch (gva32 >> 29) {
+			case 0:
+			case 1: /* CFG5 (1GB) */
+				segctl = read_gc0_segctl2() >> 16;
+				mask = (unsigned long)0xfc0000000ull;
+				break;
+			case 2:
+			case 3: /* CFG4 (1GB) */
+				segctl = read_gc0_segctl2();
+				mask = (unsigned long)0xfc0000000ull;
+				break;
+			case 4: /* CFG3 (512MB) */
+				segctl = read_gc0_segctl1() >> 16;
+				mask = (unsigned long)0xfe0000000ull;
+				break;
+			case 5: /* CFG2 (512MB) */
+				segctl = read_gc0_segctl1();
+				mask = (unsigned long)0xfe0000000ull;
+				break;
+			case 6: /* CFG1 (512MB) */
+				segctl = read_gc0_segctl0() >> 16;
+				mask = (unsigned long)0xfe0000000ull;
+				break;
+			case 7: /* CFG0 (512MB) */
+				segctl = read_gc0_segctl0();
+				mask = (unsigned long)0xfe0000000ull;
+				break;
+			default:
+				/*
+				 * GCC 4.9 isn't smart enough to figure out that
+				 * segctl and mask are always initialised.
+				 */
+				unreachable();
+			}
+
+			if (is_eva_am_mapped(vcpu, (segctl >> 4) & 0x7,
+					     segctl & 0x0008))
+				goto tlb_mapped;
+
+			/* Unmapped, find guest physical address */
+			pa = (segctl << 20) & mask;
+			pa |= gva32 & ~mask;
+			*gpa = pa;
+			return 0;
+		} else if ((s32)gva32 < (s32)0xc0000000) {
 			/* legacy unmapped KSeg0 or KSeg1 */
 			*gpa = gva32 & 0x1fffffff;
 			return 0;
@@ -438,6 +597,20 @@ static int kvm_vz_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 #ifdef CONFIG_64BIT
 	} else if ((gva & 0xc000000000000000) == 0x8000000000000000) {
 		/* XKPHYS */
+		if (cpu_guest_has_segments) {
+			/*
+			 * Each of the 8 regions can be overridden by SegCtl2.XR
+			 * to use SegCtl1.XAM.
+			 */
+			segctl = read_gc0_segctl2();
+			if (segctl & (1ull << (56 + ((gva >> 59) & 0x7)))) {
+				segctl = read_gc0_segctl1();
+				if (is_eva_am_mapped(vcpu, (segctl >> 59) & 0x7,
+						     0))
+					goto tlb_mapped;
+			}
+
+		}
 		/*
 		 * Traditionally fully unmapped.
 		 * Bits 61:59 specify the CCA, which we can just mask off here.
@@ -449,6 +622,7 @@ static int kvm_vz_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 #endif
 	}
 
+tlb_mapped:
 	return kvm_vz_guest_tlb_lookup(vcpu, gva, gpa);
 }
 
@@ -1212,6 +1386,12 @@ static u64 kvm_vz_get_one_regs_contextconfig[] = {
 #endif
 };
 
+static u64 kvm_vz_get_one_regs_segments[] = {
+	KVM_REG_MIPS_CP0_SEGCTL0,
+	KVM_REG_MIPS_CP0_SEGCTL1,
+	KVM_REG_MIPS_CP0_SEGCTL2,
+};
+
 static u64 kvm_vz_get_one_regs_kscratch[] = {
 	KVM_REG_MIPS_CP0_KSCRATCH1,
 	KVM_REG_MIPS_CP0_KSCRATCH2,
@@ -1234,6 +1414,8 @@ static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu)
 		++ret;
 	if (cpu_guest_has_contextconfig)
 		ret += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig);
+	if (cpu_guest_has_segments)
+		ret += ARRAY_SIZE(kvm_vz_get_one_regs_segments);
 	ret += __arch_hweight8(cpu_data[0].guest.kscratch_mask);
 
 	return ret;
@@ -1273,6 +1455,12 @@ static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
 			return -EFAULT;
 		indices += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig);
 	}
+	if (cpu_guest_has_segments) {
+		if (copy_to_user(indices, kvm_vz_get_one_regs_segments,
+				 sizeof(kvm_vz_get_one_regs_segments)))
+			return -EFAULT;
+		indices += ARRAY_SIZE(kvm_vz_get_one_regs_segments);
+	}
 	for (i = 0; i < 6; ++i) {
 		if (!cpu_guest_has_kscr(i + 2))
 			continue;
@@ -1361,6 +1549,21 @@ static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu,
 	case KVM_REG_MIPS_CP0_PAGEGRAIN:
 		*v = (long)read_gc0_pagegrain();
 		break;
+	case KVM_REG_MIPS_CP0_SEGCTL0:
+		if (!cpu_guest_has_segments)
+			return -EINVAL;
+		*v = read_gc0_segctl0();
+		break;
+	case KVM_REG_MIPS_CP0_SEGCTL1:
+		if (!cpu_guest_has_segments)
+			return -EINVAL;
+		*v = read_gc0_segctl1();
+		break;
+	case KVM_REG_MIPS_CP0_SEGCTL2:
+		if (!cpu_guest_has_segments)
+			return -EINVAL;
+		*v = read_gc0_segctl2();
+		break;
 	case KVM_REG_MIPS_CP0_WIRED:
 		*v = (long)read_gc0_wired();
 		break;
@@ -1528,6 +1731,21 @@ static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu,
 	case KVM_REG_MIPS_CP0_PAGEGRAIN:
 		write_gc0_pagegrain(v);
 		break;
+	case KVM_REG_MIPS_CP0_SEGCTL0:
+		if (!cpu_guest_has_segments)
+			return -EINVAL;
+		write_gc0_segctl0(v);
+		break;
+	case KVM_REG_MIPS_CP0_SEGCTL1:
+		if (!cpu_guest_has_segments)
+			return -EINVAL;
+		write_gc0_segctl1(v);
+		break;
+	case KVM_REG_MIPS_CP0_SEGCTL2:
+		if (!cpu_guest_has_segments)
+			return -EINVAL;
+		write_gc0_segctl2(v);
+		break;
 	case KVM_REG_MIPS_CP0_WIRED:
 		change_gc0_wired(MIPSR6_WIRED_WIRED, v);
 		break;
@@ -1955,6 +2173,12 @@ static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (cpu_guest_has_badinstrp)
 		kvm_restore_gc0_badinstrp(cop0);
 
+	if (cpu_guest_has_segments) {
+		kvm_restore_gc0_segctl0(cop0);
+		kvm_restore_gc0_segctl1(cop0);
+		kvm_restore_gc0_segctl2(cop0);
+	}
+
 	/* restore Root.GuestCtl2 from unused Guest guestctl2 register */
 	if (cpu_has_guestctl2)
 		write_c0_guestctl2(
@@ -2038,6 +2262,12 @@ static int kvm_vz_vcpu_put(struct kvm_vcpu *vcpu, int cpu)
 	if (cpu_guest_has_badinstrp)
 		kvm_save_gc0_badinstrp(cop0);
 
+	if (cpu_guest_has_segments) {
+		kvm_save_gc0_segctl0(cop0);
+		kvm_save_gc0_segctl1(cop0);
+		kvm_save_gc0_segctl2(cop0);
+	}
+
 	kvm_vz_save_timer(vcpu);
 
 	/* save Root.GuestCtl2 in unused Guest guestctl2 register */
@@ -2356,6 +2586,16 @@ static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu)
 #endif
 	}
 
+	/* Implementation dependent, use the legacy layout */
+	if (cpu_guest_has_segments) {
+		/* SegCtl0, SegCtl1, SegCtl2 */
+		kvm_write_sw_gc0_segctl0(cop0, 0x00200010);
+		kvm_write_sw_gc0_segctl1(cop0, 0x00000002 |
+				(_page_cachable_default >> _CACHE_SHIFT) <<
+						(16 + MIPS_SEGCFG_C_SHIFT));
+		kvm_write_sw_gc0_segctl2(cop0, 0x00380438);
+	}
+
 	/* start with no pending virtual guest interrupts */
 	if (cpu_has_guestctl2)
 		cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] = 0;
-- 
cgit v1.2.3-55-g7522


From 5a2f352f343ac9b4e869ff29b3a0a833c870303b Mon Sep 17 00:00:00 2001
From: James Hogan
Date: Tue, 14 Mar 2017 10:15:36 +0000
Subject: KVM: MIPS/VZ: Support guest hardware page table walker

Add support for VZ guest CP0_PWBase, CP0_PWField, CP0_PWSize, and
CP0_PWCtl registers for controlling the guest hardware page table walker
(HTW) present on P5600 and P6600 cores. These guest registers need
initialising on R6, context switching, and exposing via the KVM ioctl
API when they are present.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Cc: linux-doc@vger.kernel.org
---
 Documentation/virtual/kvm/api.txt |  4 ++
 arch/mips/include/asm/kvm_host.h  |  8 ++++
 arch/mips/kvm/vz.c                | 80 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 92 insertions(+)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 45194363a160..b108238dc9dc 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2081,7 +2081,11 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_SEGCTL0      | 64
   MIPS  | KVM_REG_MIPS_CP0_SEGCTL1      | 64
   MIPS  | KVM_REG_MIPS_CP0_SEGCTL2      | 64
+  MIPS  | KVM_REG_MIPS_CP0_PWBASE       | 64
+  MIPS  | KVM_REG_MIPS_CP0_PWFIELD      | 64
+  MIPS  | KVM_REG_MIPS_CP0_PWSIZE       | 64
   MIPS  | KVM_REG_MIPS_CP0_WIRED        | 32
+  MIPS  | KVM_REG_MIPS_CP0_PWCTL        | 32
   MIPS  | KVM_REG_MIPS_CP0_HWRENA       | 32
   MIPS  | KVM_REG_MIPS_CP0_BADVADDR     | 64
   MIPS  | KVM_REG_MIPS_CP0_BADINSTR     | 32
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index b2129c031df7..8d016ab3a8b9 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -42,7 +42,11 @@
 #define KVM_REG_MIPS_CP0_SEGCTL0	MIPS_CP0_64(5, 2)
 #define KVM_REG_MIPS_CP0_SEGCTL1	MIPS_CP0_64(5, 3)
 #define KVM_REG_MIPS_CP0_SEGCTL2	MIPS_CP0_64(5, 4)
+#define KVM_REG_MIPS_CP0_PWBASE		MIPS_CP0_64(5, 5)
+#define KVM_REG_MIPS_CP0_PWFIELD	MIPS_CP0_64(5, 6)
+#define KVM_REG_MIPS_CP0_PWSIZE		MIPS_CP0_64(5, 7)
 #define KVM_REG_MIPS_CP0_WIRED		MIPS_CP0_32(6, 0)
+#define KVM_REG_MIPS_CP0_PWCTL		MIPS_CP0_32(6, 6)
 #define KVM_REG_MIPS_CP0_HWRENA		MIPS_CP0_32(7, 0)
 #define KVM_REG_MIPS_CP0_BADVADDR	MIPS_CP0_64(8, 0)
 #define KVM_REG_MIPS_CP0_BADINSTR	MIPS_CP0_32(8, 1)
@@ -678,7 +682,11 @@ __BUILD_KVM_RW_HW(pagegrain,      32, MIPS_CP0_TLB_PG_MASK,  1)
 __BUILD_KVM_RW_HW(segctl0,        l,  MIPS_CP0_TLB_PG_MASK,  2)
 __BUILD_KVM_RW_HW(segctl1,        l,  MIPS_CP0_TLB_PG_MASK,  3)
 __BUILD_KVM_RW_HW(segctl2,        l,  MIPS_CP0_TLB_PG_MASK,  4)
+__BUILD_KVM_RW_HW(pwbase,         l,  MIPS_CP0_TLB_PG_MASK,  5)
+__BUILD_KVM_RW_HW(pwfield,        l,  MIPS_CP0_TLB_PG_MASK,  6)
+__BUILD_KVM_RW_HW(pwsize,         l,  MIPS_CP0_TLB_PG_MASK,  7)
 __BUILD_KVM_RW_HW(wired,          32, MIPS_CP0_TLB_WIRED,    0)
+__BUILD_KVM_RW_HW(pwctl,          32, MIPS_CP0_TLB_WIRED,    6)
 __BUILD_KVM_RW_HW(hwrena,         32, MIPS_CP0_HWRENA,       0)
 __BUILD_KVM_RW_HW(badvaddr,       l,  MIPS_CP0_BAD_VADDR,    0)
 __BUILD_KVM_RW_HW(badinstr,       32, MIPS_CP0_BAD_VADDR,    1)
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index f32c1ab3f724..fb12c5b4a75c 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -1392,6 +1392,13 @@ static u64 kvm_vz_get_one_regs_segments[] = {
 	KVM_REG_MIPS_CP0_SEGCTL2,
 };
 
+static u64 kvm_vz_get_one_regs_htw[] = {
+	KVM_REG_MIPS_CP0_PWBASE,
+	KVM_REG_MIPS_CP0_PWFIELD,
+	KVM_REG_MIPS_CP0_PWSIZE,
+	KVM_REG_MIPS_CP0_PWCTL,
+};
+
 static u64 kvm_vz_get_one_regs_kscratch[] = {
 	KVM_REG_MIPS_CP0_KSCRATCH1,
 	KVM_REG_MIPS_CP0_KSCRATCH2,
@@ -1416,6 +1423,8 @@ static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu)
 		ret += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig);
 	if (cpu_guest_has_segments)
 		ret += ARRAY_SIZE(kvm_vz_get_one_regs_segments);
+	if (cpu_guest_has_htw)
+		ret += ARRAY_SIZE(kvm_vz_get_one_regs_htw);
 	ret += __arch_hweight8(cpu_data[0].guest.kscratch_mask);
 
 	return ret;
@@ -1461,6 +1470,12 @@ static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
 			return -EFAULT;
 		indices += ARRAY_SIZE(kvm_vz_get_one_regs_segments);
 	}
+	if (cpu_guest_has_htw) {
+		if (copy_to_user(indices, kvm_vz_get_one_regs_htw,
+				 sizeof(kvm_vz_get_one_regs_htw)))
+			return -EFAULT;
+		indices += ARRAY_SIZE(kvm_vz_get_one_regs_htw);
+	}
 	for (i = 0; i < 6; ++i) {
 		if (!cpu_guest_has_kscr(i + 2))
 			continue;
@@ -1564,9 +1579,29 @@ static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu,
 			return -EINVAL;
 		*v = read_gc0_segctl2();
 		break;
+	case KVM_REG_MIPS_CP0_PWBASE:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		*v = read_gc0_pwbase();
+		break;
+	case KVM_REG_MIPS_CP0_PWFIELD:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		*v = read_gc0_pwfield();
+		break;
+	case KVM_REG_MIPS_CP0_PWSIZE:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		*v = read_gc0_pwsize();
+		break;
 	case KVM_REG_MIPS_CP0_WIRED:
 		*v = (long)read_gc0_wired();
 		break;
+	case KVM_REG_MIPS_CP0_PWCTL:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		*v = read_gc0_pwctl();
+		break;
 	case KVM_REG_MIPS_CP0_HWRENA:
 		*v = (long)read_gc0_hwrena();
 		break;
@@ -1746,9 +1781,29 @@ static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu,
 			return -EINVAL;
 		write_gc0_segctl2(v);
 		break;
+	case KVM_REG_MIPS_CP0_PWBASE:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		write_gc0_pwbase(v);
+		break;
+	case KVM_REG_MIPS_CP0_PWFIELD:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		write_gc0_pwfield(v);
+		break;
+	case KVM_REG_MIPS_CP0_PWSIZE:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		write_gc0_pwsize(v);
+		break;
 	case KVM_REG_MIPS_CP0_WIRED:
 		change_gc0_wired(MIPSR6_WIRED_WIRED, v);
 		break;
+	case KVM_REG_MIPS_CP0_PWCTL:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		write_gc0_pwctl(v);
+		break;
 	case KVM_REG_MIPS_CP0_HWRENA:
 		write_gc0_hwrena(v);
 		break;
@@ -2179,6 +2234,14 @@ static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		kvm_restore_gc0_segctl2(cop0);
 	}
 
+	/* restore HTW registers */
+	if (cpu_guest_has_htw) {
+		kvm_restore_gc0_pwbase(cop0);
+		kvm_restore_gc0_pwfield(cop0);
+		kvm_restore_gc0_pwsize(cop0);
+		kvm_restore_gc0_pwctl(cop0);
+	}
+
 	/* restore Root.GuestCtl2 from unused Guest guestctl2 register */
 	if (cpu_has_guestctl2)
 		write_c0_guestctl2(
@@ -2268,6 +2331,15 @@ static int kvm_vz_vcpu_put(struct kvm_vcpu *vcpu, int cpu)
 		kvm_save_gc0_segctl2(cop0);
 	}
 
+	/* save HTW registers if enabled in guest */
+	if (cpu_guest_has_htw &&
+	    kvm_read_sw_gc0_config3(cop0) & MIPS_CONF3_PW) {
+		kvm_save_gc0_pwbase(cop0);
+		kvm_save_gc0_pwfield(cop0);
+		kvm_save_gc0_pwsize(cop0);
+		kvm_save_gc0_pwctl(cop0);
+	}
+
 	kvm_vz_save_timer(vcpu);
 
 	/* save Root.GuestCtl2 in unused Guest guestctl2 register */
@@ -2596,6 +2668,14 @@ static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu)
 		kvm_write_sw_gc0_segctl2(cop0, 0x00380438);
 	}
 
+	/* reset HTW registers */
+	if (cpu_guest_has_htw && cpu_has_mips_r6) {
+		/* PWField */
+		kvm_write_sw_gc0_pwfield(cop0, 0x0c30c302);
+		/* PWSize */
+		kvm_write_sw_gc0_pwsize(cop0, 1 << MIPS_PWSIZE_PTW_SHIFT);
+	}
+
 	/* start with no pending virtual guest interrupts */
 	if (cpu_has_guestctl2)
 		cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] = 0;
-- 
cgit v1.2.3-55-g7522


From d42a008f86ba3d715d31788fc5143a4de5685d33 Mon Sep 17 00:00:00 2001
From: James Hogan
Date: Tue, 14 Mar 2017 10:15:38 +0000
Subject: KVM: MIPS/VZ: Emulate MAARs when necessary

Add emulation of Memory Accessibility Attribute Registers (MAARs) when
necessary. We can't actually do anything with whatever the guest
provides, but it may not be possible to clear Guest.Config5.MRP so we
have to emulate at least a pair of MAARs.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Cc: linux-doc@vger.kernel.org
---
 Documentation/virtual/kvm/api.txt |   5 ++
 arch/mips/include/asm/kvm_host.h  |   5 ++
 arch/mips/include/uapi/asm/kvm.h  |  20 +++++--
 arch/mips/kvm/trace.h             |   2 +
 arch/mips/kvm/vz.c                | 110 +++++++++++++++++++++++++++++++++++++-
 5 files changed, 137 insertions(+), 5 deletions(-)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index b108238dc9dc..e601c8f01fd9 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2114,6 +2114,7 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_KSCRATCH4    | 64
   MIPS  | KVM_REG_MIPS_CP0_KSCRATCH5    | 64
   MIPS  | KVM_REG_MIPS_CP0_KSCRATCH6    | 64
+  MIPS  | KVM_REG_MIPS_CP0_MAAR(0..63)  | 64
   MIPS  | KVM_REG_MIPS_COUNT_CTL        | 64
   MIPS  | KVM_REG_MIPS_COUNT_RESUME     | 64
   MIPS  | KVM_REG_MIPS_COUNT_HZ         | 64
@@ -2180,6 +2181,10 @@ hardware, host kernel, guest, and whether XPA is present in the guest, i.e.
 with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and
 the PFNX field starting at bit 30.
 
+MIPS MAARs (see KVM_REG_MIPS_CP0_MAAR(*) above) have the following id bit
+patterns:
+  0x7030 0000 0001 01 <reg:8>
+
 MIPS KVM control registers (see above) have the following id bit patterns:
   0x7030 0000 0002 <reg:16>
 
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 8d016ab3a8b9..a662a80152b1 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -67,6 +67,7 @@
 #define KVM_REG_MIPS_CP0_CONFIG4	MIPS_CP0_32(16, 4)
 #define KVM_REG_MIPS_CP0_CONFIG5	MIPS_CP0_32(16, 5)
 #define KVM_REG_MIPS_CP0_CONFIG7	MIPS_CP0_32(16, 7)
+#define KVM_REG_MIPS_CP0_MAARI		MIPS_CP0_64(17, 2)
 #define KVM_REG_MIPS_CP0_XCONTEXT	MIPS_CP0_64(20, 0)
 #define KVM_REG_MIPS_CP0_ERROREPC	MIPS_CP0_64(30, 0)
 #define KVM_REG_MIPS_CP0_KSCRATCH1	MIPS_CP0_64(31, 2)
@@ -388,6 +389,9 @@ struct kvm_vcpu_arch {
 	struct kvm_mips_tlb *wired_tlb;
 	unsigned int wired_tlb_limit;
 	unsigned int wired_tlb_used;
+
+	/* emulated guest MAAR registers */
+	unsigned long maar[6];
 #endif
 
 	/* Last CPU the VCPU state was loaded on */
@@ -708,6 +712,7 @@ __BUILD_KVM_RW_HW(config4,        32, MIPS_CP0_CONFIG,       4)
 __BUILD_KVM_RW_HW(config5,        32, MIPS_CP0_CONFIG,       5)
 __BUILD_KVM_RW_HW(config6,        32, MIPS_CP0_CONFIG,       6)
 __BUILD_KVM_RW_HW(config7,        32, MIPS_CP0_CONFIG,       7)
+__BUILD_KVM_RW_SW(maari,          l,  MIPS_CP0_LLADDR,       2)
 __BUILD_KVM_RW_HW(xcontext,       l,  MIPS_CP0_TLB_XCONTEXT, 0)
 __BUILD_KVM_RW_HW(errorepc,       l,  MIPS_CP0_ERROR_PC,     0)
 __BUILD_KVM_RW_HW(kscratch1,      l,  MIPS_CP0_DESAVE,       2)
diff --git a/arch/mips/include/uapi/asm/kvm.h b/arch/mips/include/uapi/asm/kvm.h
index a8a0199bf760..3107095d7f0a 100644
--- a/arch/mips/include/uapi/asm/kvm.h
+++ b/arch/mips/include/uapi/asm/kvm.h
@@ -54,9 +54,14 @@ struct kvm_fpu {
  * Register set = 0: GP registers from kvm_regs (see definitions below).
  *
  * Register set = 1: CP0 registers.
- *  bits[15..8]  - Must be zero.
- *  bits[7..3]   - Register 'rd'  index.
- *  bits[2..0]   - Register 'sel' index.
+ *  bits[15..8]  - COP0 register set.
+ *
+ *  COP0 register set = 0: Main CP0 registers.
+ *   bits[7..3]   - Register 'rd'  index.
+ *   bits[2..0]   - Register 'sel' index.
+ *
+ *  COP0 register set = 1: MAARs.
+ *   bits[7..0]   - MAAR index.
  *
  * Register set = 2: KVM specific registers (see definitions below).
  *
@@ -114,6 +119,15 @@ struct kvm_fpu {
 #define KVM_REG_MIPS_PC		(KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 34)
 
 
+/*
+ * KVM_REG_MIPS_CP0 - Coprocessor 0 registers.
+ */
+
+#define KVM_REG_MIPS_MAAR	(KVM_REG_MIPS_CP0 | (1 << 8))
+#define KVM_REG_MIPS_CP0_MAAR(n)	(KVM_REG_MIPS_MAAR | \
+					 KVM_REG_SIZE_U64 | (n))
+
+
 /*
  * KVM_REG_MIPS_KVM - KVM specific control registers.
  */
diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h
index d80d37a1b82e..affde8a2c584 100644
--- a/arch/mips/kvm/trace.h
+++ b/arch/mips/kvm/trace.h
@@ -176,6 +176,8 @@ TRACE_EVENT(kvm_exit,
 	{ KVM_TRACE_COP0(16, 4),	"Config4" },		\
 	{ KVM_TRACE_COP0(16, 5),	"Config5" },		\
 	{ KVM_TRACE_COP0(16, 7),	"Config7" },		\
+	{ KVM_TRACE_COP0(17, 1),	"MAAR" },		\
+	{ KVM_TRACE_COP0(17, 2),	"MAARI" },		\
 	{ KVM_TRACE_COP0(26, 0),	"ECC" },		\
 	{ KVM_TRACE_COP0(30, 0),	"ErrorEPC" },		\
 	{ KVM_TRACE_COP0(31, 2),	"KScratch1" },		\
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index 450f946358ae..cbc6850cff02 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -134,7 +134,7 @@ static inline unsigned int kvm_vz_config5_guest_wrmask(struct kvm_vcpu *vcpu)
  * Config3:	M, MSAP, [BPG], ULRI, [DSP2P, DSPP], CTXTC, [ITL, LPA, VEIC,
  *		VInt, SP, CDMM, MT, SM, TL]
  * Config4:	M, [VTLBSizeExt, MMUSizeExt]
- * Config5:	[MRP]
+ * Config5:	MRP
  */
 
 static inline unsigned int kvm_vz_config_user_wrmask(struct kvm_vcpu *vcpu)
@@ -177,7 +177,7 @@ static inline unsigned int kvm_vz_config4_user_wrmask(struct kvm_vcpu *vcpu)
 
 static inline unsigned int kvm_vz_config5_user_wrmask(struct kvm_vcpu *vcpu)
 {
-	return kvm_vz_config5_guest_wrmask(vcpu);
+	return kvm_vz_config5_guest_wrmask(vcpu) | MIPS_CONF5_MRP;
 }
 
 static gpa_t kvm_vz_gva_to_gpa_cb(gva_t gva)
@@ -685,6 +685,41 @@ static int kvm_trap_vz_no_handler(struct kvm_vcpu *vcpu)
 	return RESUME_HOST;
 }
 
+static unsigned long mips_process_maar(unsigned int op, unsigned long val)
+{
+	/* Mask off unused bits */
+	unsigned long mask = 0xfffff000 | MIPS_MAAR_S | MIPS_MAAR_VL;
+
+	if (read_gc0_pagegrain() & PG_ELPA)
+		mask |= 0x00ffffff00000000ull;
+	if (cpu_guest_has_mvh)
+		mask |= MIPS_MAAR_VH;
+
+	/* Set or clear VH */
+	if (op == mtc_op) {
+		/* clear VH */
+		val &= ~MIPS_MAAR_VH;
+	} else if (op == dmtc_op) {
+		/* set VH to match VL */
+		val &= ~MIPS_MAAR_VH;
+		if (val & MIPS_MAAR_VL)
+			val |= MIPS_MAAR_VH;
+	}
+
+	return val & mask;
+}
+
+static void kvm_write_maari(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+
+	val &= MIPS_MAARI_INDEX;
+	if (val == MIPS_MAARI_INDEX)
+		kvm_write_sw_gc0_maari(cop0, ARRAY_SIZE(vcpu->arch.maar) - 1);
+	else if (val < ARRAY_SIZE(vcpu->arch.maar))
+		kvm_write_sw_gc0_maari(cop0, val);
+}
+
 static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst,
 					      u32 *opc, u32 cause,
 					      struct kvm_run *run,
@@ -737,6 +772,15 @@ static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst,
 						MIPS_LLADDR_LLB;
 				else
 					val = 0;
+			} else if (rd == MIPS_CP0_LLADDR &&
+				   sel == 1 &&		/* MAAR */
+				   cpu_guest_has_maar &&
+				   !cpu_guest_has_dyn_maar) {
+				/* MAARI must be in range */
+				BUG_ON(kvm_read_sw_gc0_maari(cop0) >=
+						ARRAY_SIZE(vcpu->arch.maar));
+				val = vcpu->arch.maar[
+					kvm_read_sw_gc0_maari(cop0)];
 			} else if ((rd == MIPS_CP0_PRID &&
 				    (sel == 0 ||	/* PRid */
 				     sel == 2 ||	/* CDMMBase */
@@ -746,6 +790,10 @@ static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst,
 				     sel == 3)) ||	/* SRSMap */
 				   (rd == MIPS_CP0_CONFIG &&
 				    (sel == 7)) ||	/* Config7 */
+				   (rd == MIPS_CP0_LLADDR &&
+				    (sel == 2) &&	/* MAARI */
+				    cpu_guest_has_maar &&
+				    !cpu_guest_has_dyn_maar) ||
 				   (rd == MIPS_CP0_ERRCTL &&
 				    (sel == 0))) {	/* ErrCtl */
 				val = cop0->reg[rd][sel];
@@ -793,6 +841,23 @@ static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst,
 				if (cpu_guest_has_rw_llb &&
 				    !(val & MIPS_LLADDR_LLB))
 					write_gc0_lladdr(0);
+			} else if (rd == MIPS_CP0_LLADDR &&
+				   sel == 1 &&		/* MAAR */
+				   cpu_guest_has_maar &&
+				   !cpu_guest_has_dyn_maar) {
+				val = mips_process_maar(inst.c0r_format.rs,
+							val);
+
+				/* MAARI must be in range */
+				BUG_ON(kvm_read_sw_gc0_maari(cop0) >=
+						ARRAY_SIZE(vcpu->arch.maar));
+				vcpu->arch.maar[kvm_read_sw_gc0_maari(cop0)] =
+									val;
+			} else if (rd == MIPS_CP0_LLADDR &&
+				   (sel == 2) &&	/* MAARI */
+				   cpu_guest_has_maar &&
+				   !cpu_guest_has_dyn_maar) {
+				kvm_write_maari(vcpu, val);
 			} else if (rd == MIPS_CP0_ERRCTL &&
 				   (sel == 0)) {	/* ErrCtl */
 				/* ignore the written value */
@@ -1441,6 +1506,8 @@ static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu)
 		ret += ARRAY_SIZE(kvm_vz_get_one_regs_segments);
 	if (cpu_guest_has_htw)
 		ret += ARRAY_SIZE(kvm_vz_get_one_regs_htw);
+	if (cpu_guest_has_maar && !cpu_guest_has_dyn_maar)
+		ret += 1 + ARRAY_SIZE(vcpu->arch.maar);
 	ret += __arch_hweight8(cpu_data[0].guest.kscratch_mask);
 
 	return ret;
@@ -1492,6 +1559,19 @@ static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
 			return -EFAULT;
 		indices += ARRAY_SIZE(kvm_vz_get_one_regs_htw);
 	}
+	if (cpu_guest_has_maar && !cpu_guest_has_dyn_maar) {
+		for (i = 0; i < ARRAY_SIZE(vcpu->arch.maar); ++i) {
+			index = KVM_REG_MIPS_CP0_MAAR(i);
+			if (copy_to_user(indices, &index, sizeof(index)))
+				return -EFAULT;
+			++indices;
+		}
+
+		index = KVM_REG_MIPS_CP0_MAARI;
+		if (copy_to_user(indices, &index, sizeof(index)))
+			return -EFAULT;
+		++indices;
+	}
 	for (i = 0; i < 6; ++i) {
 		if (!cpu_guest_has_kscr(i + 2))
 			continue;
@@ -1689,6 +1769,19 @@ static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu,
 			return -EINVAL;
 		*v = read_gc0_config5();
 		break;
+	case KVM_REG_MIPS_CP0_MAAR(0) ... KVM_REG_MIPS_CP0_MAAR(0x3f):
+		if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar)
+			return -EINVAL;
+		idx = reg->id - KVM_REG_MIPS_CP0_MAAR(0);
+		if (idx >= ARRAY_SIZE(vcpu->arch.maar))
+			return -EINVAL;
+		*v = vcpu->arch.maar[idx];
+		break;
+	case KVM_REG_MIPS_CP0_MAARI:
+		if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar)
+			return -EINVAL;
+		*v = kvm_read_sw_gc0_maari(vcpu->arch.cop0);
+		break;
 #ifdef CONFIG_64BIT
 	case KVM_REG_MIPS_CP0_XCONTEXT:
 		*v = read_gc0_xcontext();
@@ -1938,6 +2031,19 @@ static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu,
 			write_gc0_config5(v);
 		}
 		break;
+	case KVM_REG_MIPS_CP0_MAAR(0) ... KVM_REG_MIPS_CP0_MAAR(0x3f):
+		if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar)
+			return -EINVAL;
+		idx = reg->id - KVM_REG_MIPS_CP0_MAAR(0);
+		if (idx >= ARRAY_SIZE(vcpu->arch.maar))
+			return -EINVAL;
+		vcpu->arch.maar[idx] = mips_process_maar(dmtc_op, v);
+		break;
+	case KVM_REG_MIPS_CP0_MAARI:
+		if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar)
+			return -EINVAL;
+		kvm_write_maari(vcpu, v);
+		break;
 #ifdef CONFIG_64BIT
 	case KVM_REG_MIPS_CP0_XCONTEXT:
 		write_gc0_xcontext(v);
-- 
cgit v1.2.3-55-g7522


From 08fab50da669e5ee5a542592895fcb63be3cd7b1 Mon Sep 17 00:00:00 2001
From: Fei Li
Date: Thu, 19 Jan 2017 17:02:26 +0100
Subject: KVM: s390: interface for suppressible I/O adapters

In order to properly implement adapter-interruption suppression, we
need a way for userspace to specify which adapters are subject to
suppression. Let's convert the existing (and unused) 'pad' field into
a 'flags' field and define a flag value for suppressible adapters.

Also add documentation for the interface.

Signed-off-by: Fei Li <sherrylf@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 Documentation/virtual/kvm/devices/s390_flic.txt | 12 +++++++++---
 arch/s390/include/asm/kvm_host.h                |  1 +
 arch/s390/include/uapi/asm/kvm.h                |  4 +++-
 arch/s390/kvm/interrupt.c                       |  2 ++
 4 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
index 6b0e115301c8..e8ee3b6edb05 100644
--- a/Documentation/virtual/kvm/devices/s390_flic.txt
+++ b/Documentation/virtual/kvm/devices/s390_flic.txt
@@ -64,12 +64,18 @@ struct kvm_s390_io_adapter {
 	__u8 isc;
 	__u8 maskable;
 	__u8 swap;
-	__u8 pad;
+	__u8 flags;
 };
 
    id contains the unique id for the adapter, isc the I/O interruption subclass
-   to use, maskable whether this adapter may be masked (interrupts turned off)
-   and swap whether the indicators need to be byte swapped.
+   to use, maskable whether this adapter may be masked (interrupts turned off),
+   swap whether the indicators need to be byte swapped, and flags contains
+   further characteristics of the adapter.
+   Currently defined values for 'flags' are:
+   - KVM_S390_ADAPTER_SUPPRESSIBLE: adapter is subject to AIS
+     (adapter-interrupt-suppression) facility. This flag only has an effect if
+     the AIS capability is enabled.
+   Unknown flag values are ignored.
 
 
   KVM_DEV_FLIC_ADAPTER_MODIFY
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 1af090d93bf5..499c72c2280d 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -621,6 +621,7 @@ struct s390_io_adapter {
 	bool maskable;
 	bool masked;
 	bool swap;
+	bool suppressible;
 	struct rw_semaphore maps_lock;
 	struct list_head maps;
 	atomic_t nr_maps;
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 5bd23cfd9ae5..5fa144d1df0a 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -41,9 +41,11 @@ struct kvm_s390_io_adapter {
 	__u8 isc;
 	__u8 maskable;
 	__u8 swap;
-	__u8 pad;
+	__u8 flags;
 };
 
+#define KVM_S390_ADAPTER_SUPPRESSIBLE 0x01
+
 #define KVM_S390_IO_ADAPTER_MASK 1
 #define KVM_S390_IO_ADAPTER_MAP 2
 #define KVM_S390_IO_ADAPTER_UNMAP 3
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 311eef0df855..dba51ad62570 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1997,6 +1997,8 @@ static int register_io_adapter(struct kvm_device *dev,
 	adapter->maskable = adapter_info.maskable;
 	adapter->masked = false;
 	adapter->swap = adapter_info.swap;
+	adapter->suppressible = (adapter_info.flags) &
+				KVM_S390_ADAPTER_SUPPRESSIBLE;
 	dev->kvm->arch.adapters[adapter->id] = adapter;
 
 	return 0;
-- 
cgit v1.2.3-55-g7522


From 519783935451764b397f2a712de5ea778ff77fdf Mon Sep 17 00:00:00 2001
From: Fei Li
Date: Fri, 17 Feb 2017 17:06:26 +0800
Subject: KVM: s390: introduce ais mode modify function

Provide an interface for userspace to modify AIS
(adapter-interruption-suppression) mode state, and add documentation
for the interface. Allowed target modes are ALL-Interruptions mode
and SINGLE-Interruption mode.

We introduce the 'simm' and 'nimm' fields in kvm_s390_float_interrupt
to store the interruption modes for each ISC. Each bit in 'simm' and
'nimm' corresponds to one ISC, and together the two bits indicate one of
three modes: ALL-Interruptions, SINGLE-Interruption and NO-Interruptions.
This interface can initiate most transitions between the states; the
transition from SINGLE-Interruption to NO-Interruptions via adapter
interrupt injection will be introduced in a following patch. The
meaningful combinations are as follows:

    interruption mode | simm bit | nimm bit
    ------------------|----------|----------
             ALL      |    0     |     0
           SINGLE     |    1     |     0
             NO       |    1     |     1

Also add a tracepoint to track AIS mode transitions.

Co-Authored-By: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Signed-off-by: Fei Li <sherrylf@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 Documentation/virtual/kvm/devices/s390_flic.txt | 20 ++++++++++++
 arch/s390/include/asm/kvm_host.h                | 10 ++++++
 arch/s390/include/uapi/asm/kvm.h                |  6 ++++
 arch/s390/kvm/interrupt.c                       | 43 +++++++++++++++++++++++++
 arch/s390/kvm/kvm-s390.c                        |  4 +++
 arch/s390/kvm/trace-s390.h                      | 31 ++++++++++++++++++
 6 files changed, 114 insertions(+)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
index e8ee3b6edb05..dfd42fd4abd5 100644
--- a/Documentation/virtual/kvm/devices/s390_flic.txt
+++ b/Documentation/virtual/kvm/devices/s390_flic.txt
@@ -14,6 +14,7 @@ FLIC provides support to
 - purge one pending floating I/O interrupt (KVM_DEV_FLIC_CLEAR_IO_IRQ)
 - enable/disable for the guest transparent async page faults
 - register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*)
+- modify AIS (adapter-interruption-suppression) mode state (KVM_DEV_FLIC_AISM)
 
 Groups:
   KVM_DEV_FLIC_ENQUEUE
@@ -107,6 +108,25 @@ struct kvm_s390_io_adapter_req {
       release a userspace page for the translated address specified in addr
       from the list of mappings
 
+  KVM_DEV_FLIC_AISM
+    modify the adapter-interruption-suppression mode for a given isc if the
+    AIS capability is enabled. Takes a kvm_s390_ais_req describing:
+
+struct kvm_s390_ais_req {
+	__u8 isc;
+	__u16 mode;
+};
+
+    isc contains the target I/O interruption subclass, mode the target
+    adapter-interruption-suppression mode. The following modes are
+    currently supported:
+    - KVM_S390_AIS_MODE_ALL: ALL-Interruptions Mode, i.e. airq injection
+      is always allowed;
+    - KVM_S390_AIS_MODE_SINGLE: SINGLE-Interruption Mode, i.e. airq
+      injection is only allowed once and the following adapter interrupts
+      will be suppressed until the mode is set again to ALL-Interruptions
+      or SINGLE-Interruption mode.
+
 Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on
 FLIC with an unknown group or attribute gives the error code EINVAL (instead of
 ENXIO, as specified in the API documentation). It is not possible to conclude
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 499c72c2280d..552c319483c6 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -521,6 +521,12 @@ struct kvm_s390_local_interrupt {
 #define FIRQ_CNTR_PFAULT   3
 #define FIRQ_MAX_COUNT     4
 
+/* mask the AIS mode for a given ISC */
+#define AIS_MODE_MASK(isc) (0x80 >> isc)
+
+#define KVM_S390_AIS_MODE_ALL    0
+#define KVM_S390_AIS_MODE_SINGLE 1
+
 struct kvm_s390_float_interrupt {
 	unsigned long pending_irqs;
 	spinlock_t lock;
@@ -530,6 +536,10 @@ struct kvm_s390_float_interrupt {
 	struct kvm_s390_ext_info srv_signal;
 	int next_rr_cpu;
 	unsigned long idle_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+	struct mutex ais_lock;
+	u8 simm;
+	u8 nimm;
+	int ais_enabled;
 };
 
 struct kvm_hw_wp_info_arch {
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 5fa144d1df0a..50d2a927c990 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -26,6 +26,7 @@
 #define KVM_DEV_FLIC_ADAPTER_REGISTER	6
 #define KVM_DEV_FLIC_ADAPTER_MODIFY	7
 #define KVM_DEV_FLIC_CLEAR_IO_IRQ	8
+#define KVM_DEV_FLIC_AISM		9
 /*
  * We can have up to 4*64k pending subchannels + 8 adapter interrupts,
  * as well as up  to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts.
@@ -46,6 +47,11 @@ struct kvm_s390_io_adapter {
 
 #define KVM_S390_ADAPTER_SUPPRESSIBLE 0x01
 
+struct kvm_s390_ais_req {
+	__u8 isc;
+	__u16 mode;
+};
+
 #define KVM_S390_IO_ADAPTER_MASK 1
 #define KVM_S390_IO_ADAPTER_MAP 2
 #define KVM_S390_IO_ADAPTER_UNMAP 3
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index dba51ad62570..96b689e48c08 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2152,6 +2152,45 @@ static int clear_io_irq(struct kvm *kvm, struct kvm_device_attr *attr)
 	return 0;
 }
 
+static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+	struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+	struct kvm_s390_ais_req req;
+	int ret = 0;
+
+	if (!fi->ais_enabled)
+		return -ENOTSUPP;
+
+	if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req)))
+		return -EFAULT;
+
+	if (req.isc > MAX_ISC)
+		return -EINVAL;
+
+	trace_kvm_s390_modify_ais_mode(req.isc,
+				       (fi->simm & AIS_MODE_MASK(req.isc)) ?
+				       (fi->nimm & AIS_MODE_MASK(req.isc)) ?
+				       2 : KVM_S390_AIS_MODE_SINGLE :
+				       KVM_S390_AIS_MODE_ALL, req.mode);
+
+	mutex_lock(&fi->ais_lock);
+	switch (req.mode) {
+	case KVM_S390_AIS_MODE_ALL:
+		fi->simm &= ~AIS_MODE_MASK(req.isc);
+		fi->nimm &= ~AIS_MODE_MASK(req.isc);
+		break;
+	case KVM_S390_AIS_MODE_SINGLE:
+		fi->simm |= AIS_MODE_MASK(req.isc);
+		fi->nimm &= ~AIS_MODE_MASK(req.isc);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	mutex_unlock(&fi->ais_lock);
+
+	return ret;
+}
+
 static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 {
 	int r = 0;
@@ -2188,6 +2227,9 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 	case KVM_DEV_FLIC_CLEAR_IO_IRQ:
 		r = clear_io_irq(dev->kvm, attr);
 		break;
+	case KVM_DEV_FLIC_AISM:
+		r = modify_ais_mode(dev->kvm, attr);
+		break;
 	default:
 		r = -EINVAL;
 	}
@@ -2207,6 +2249,7 @@ static int flic_has_attr(struct kvm_device *dev,
 	case KVM_DEV_FLIC_ADAPTER_REGISTER:
 	case KVM_DEV_FLIC_ADAPTER_MODIFY:
 	case KVM_DEV_FLIC_CLEAR_IO_IRQ:
+	case KVM_DEV_FLIC_AISM:
 		return 0;
 	}
 	return -ENXIO;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index f83f18b77f3d..977cc1660a83 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1515,6 +1515,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	kvm_s390_crypto_init(kvm);
 
+	mutex_init(&kvm->arch.float_int.ais_lock);
+	kvm->arch.float_int.simm = 0;
+	kvm->arch.float_int.nimm = 0;
+	kvm->arch.float_int.ais_enabled = 0;
 	spin_lock_init(&kvm->arch.float_int.lock);
 	for (i = 0; i < FIRQ_LIST_COUNT; i++)
 		INIT_LIST_HEAD(&kvm->arch.float_int.lists[i]);
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
index 396485bca191..b32994d1546a 100644
--- a/arch/s390/kvm/trace-s390.h
+++ b/arch/s390/kvm/trace-s390.h
@@ -280,6 +280,37 @@ TRACE_EVENT(kvm_s390_enable_disable_ibs,
 		      __entry->state ? "enabling" : "disabling", __entry->id)
 	);
 
+/*
+ * Trace point for modifying ais mode for a given isc.
+ */
+TRACE_EVENT(kvm_s390_modify_ais_mode,
+	    TP_PROTO(__u8 isc, __u16 from, __u16 to),
+	    TP_ARGS(isc, from, to),
+
+	    TP_STRUCT__entry(
+		    __field(__u8, isc)
+		    __field(__u16, from)
+		    __field(__u16, to)
+		    ),
+
+	    TP_fast_assign(
+		    __entry->isc = isc;
+		    __entry->from = from;
+		    __entry->to = to;
+		    ),
+
+	    TP_printk("for isc %x, modifying interruption mode from %s to %s",
+		      __entry->isc,
+		      (__entry->from == KVM_S390_AIS_MODE_ALL) ?
+		      "ALL-Interruptions Mode" :
+		      (__entry->from == KVM_S390_AIS_MODE_SINGLE) ?
+		      "Single-Interruption Mode" : "No-Interruptions Mode",
+		      (__entry->to == KVM_S390_AIS_MODE_ALL) ?
+		      "ALL-Interruptions Mode" :
+		      (__entry->to == KVM_S390_AIS_MODE_SINGLE) ?
+		      "Single-Interruption Mode" : "No-Interruptions Mode")
+	);
+
 
 #endif /* _TRACE_KVMS390_H */
 
-- 
cgit v1.2.3-55-g7522


From a8920950131b1394f9e99ff57a5cf5ceeb0cc25c Mon Sep 17 00:00:00 2001
From: Yi Min Zhao
Date: Mon, 20 Feb 2017 10:15:01 +0800
Subject: KVM: s390: introduce adapter interrupt inject function

Add an interface to inject adapter interrupts on a specified adapter.
Identifying the adapter allows its flags to be retrieved, e.g. whether
the adapter is subject to the AIS facility or not. Also add
documentation for this interface.

For adapters subject to AIS, handle the airq injection suppression
for a given ISC according to the interruption mode:
- before injection, if the ISC is in NO-Interruptions Mode, suppress
  the interrupt and just return 0; otherwise, allow the injection.
- after a successful injection, if the ISC is in SINGLE-Interruption
  Mode, change it to NO-Interruptions Mode to suppress the following
  interrupts.

Also add a tracepoint for suppressed airqs and trace the resulting AIS
mode transitions.

Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Signed-off-by: Fei Li <sherrylf@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 Documentation/virtual/kvm/devices/s390_flic.txt |  9 +++++
 arch/s390/include/uapi/asm/kvm.h                |  1 +
 arch/s390/kvm/interrupt.c                       | 53 ++++++++++++++++++++++---
 arch/s390/kvm/trace-s390.h                      | 21 ++++++++++
 4 files changed, 78 insertions(+), 6 deletions(-)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
index dfd42fd4abd5..c2518cea8ab4 100644
--- a/Documentation/virtual/kvm/devices/s390_flic.txt
+++ b/Documentation/virtual/kvm/devices/s390_flic.txt
@@ -15,6 +15,7 @@ FLIC provides support to
 - enable/disable for the guest transparent async page faults
 - register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*)
 - modify AIS (adapter-interruption-suppression) mode state (KVM_DEV_FLIC_AISM)
+- inject adapter interrupts on a specified adapter (KVM_DEV_FLIC_AIRQ_INJECT)
 
 Groups:
   KVM_DEV_FLIC_ENQUEUE
@@ -127,6 +128,14 @@ struct kvm_s390_ais_req {
       will be suppressed until the mode is set again to ALL-Interruptions
       or SINGLE-Interruption mode.
 
+  KVM_DEV_FLIC_AIRQ_INJECT
+    Inject adapter interrupts on a specified adapter.
+    attr->attr contains the unique id for the adapter, which allows for
+    adapter-specific checks and actions.
+    For adapters subject to AIS, handle the airq injection suppression for
+    an isc according to the adapter-interruption-suppression mode on condition
+    that the AIS capability is enabled.
+
 Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on
 FLIC with an unknown group or attribute gives the error code EINVAL (instead of
 ENXIO, as specified in the API documentation). It is not possible to conclude
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 50d2a927c990..2c9ad251fa33 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -27,6 +27,7 @@
 #define KVM_DEV_FLIC_ADAPTER_MODIFY	7
 #define KVM_DEV_FLIC_CLEAR_IO_IRQ	8
 #define KVM_DEV_FLIC_AISM		9
+#define KVM_DEV_FLIC_AIRQ_INJECT	10
 /*
  * We can have up to 4*64k pending subchannels + 8 adapter interrupts,
  * as well as up  to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts.
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 96b689e48c08..482673e3436d 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2191,6 +2191,48 @@ static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr)
 	return ret;
 }
 
+static int kvm_s390_inject_airq(struct kvm *kvm,
+				struct s390_io_adapter *adapter)
+{
+	struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+	struct kvm_s390_interrupt s390int = {
+		.type = KVM_S390_INT_IO(1, 0, 0, 0),
+		.parm = 0,
+		.parm64 = (adapter->isc << 27) | 0x80000000,
+	};
+	int ret = 0;
+
+	if (!fi->ais_enabled || !adapter->suppressible)
+		return kvm_s390_inject_vm(kvm, &s390int);
+
+	mutex_lock(&fi->ais_lock);
+	if (fi->nimm & AIS_MODE_MASK(adapter->isc)) {
+		trace_kvm_s390_airq_suppressed(adapter->id, adapter->isc);
+		goto out;
+	}
+
+	ret = kvm_s390_inject_vm(kvm, &s390int);
+	if (!ret && (fi->simm & AIS_MODE_MASK(adapter->isc))) {
+		fi->nimm |= AIS_MODE_MASK(adapter->isc);
+		trace_kvm_s390_modify_ais_mode(adapter->isc,
+					       KVM_S390_AIS_MODE_SINGLE, 2);
+	}
+out:
+	mutex_unlock(&fi->ais_lock);
+	return ret;
+}
+
+static int flic_inject_airq(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+	unsigned int id = attr->attr;
+	struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+
+	if (!adapter)
+		return -EINVAL;
+
+	return kvm_s390_inject_airq(kvm, adapter);
+}
+
 static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 {
 	int r = 0;
@@ -2230,6 +2272,9 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 	case KVM_DEV_FLIC_AISM:
 		r = modify_ais_mode(dev->kvm, attr);
 		break;
+	case KVM_DEV_FLIC_AIRQ_INJECT:
+		r = flic_inject_airq(dev->kvm, attr);
+		break;
 	default:
 		r = -EINVAL;
 	}
@@ -2250,6 +2295,7 @@ static int flic_has_attr(struct kvm_device *dev,
 	case KVM_DEV_FLIC_ADAPTER_MODIFY:
 	case KVM_DEV_FLIC_CLEAR_IO_IRQ:
 	case KVM_DEV_FLIC_AISM:
+	case KVM_DEV_FLIC_AIRQ_INJECT:
 		return 0;
 	}
 	return -ENXIO;
@@ -2360,12 +2406,7 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
 	ret = adapter_indicators_set(kvm, adapter, &e->adapter);
 	up_read(&adapter->maps_lock);
 	if ((ret > 0) && !adapter->masked) {
-		struct kvm_s390_interrupt s390int = {
-			.type = KVM_S390_INT_IO(1, 0, 0, 0),
-			.parm = 0,
-			.parm64 = (adapter->isc << 27) | 0x80000000,
-		};
-		ret = kvm_s390_inject_vm(kvm, &s390int);
+		ret = kvm_s390_inject_airq(kvm, adapter);
 		if (ret == 0)
 			ret = 1;
 	}
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
index b32994d1546a..78b7e847984a 100644
--- a/arch/s390/kvm/trace-s390.h
+++ b/arch/s390/kvm/trace-s390.h
@@ -311,6 +311,27 @@ TRACE_EVENT(kvm_s390_modify_ais_mode,
 		      "Single-Interruption Mode" : "No-Interruptions Mode")
 	);
 
+/*
+ * Trace point for suppressed adapter I/O interrupt.
+ */
+TRACE_EVENT(kvm_s390_airq_suppressed,
+	    TP_PROTO(__u32 id, __u8 isc),
+	    TP_ARGS(id, isc),
+
+	    TP_STRUCT__entry(
+		    __field(__u32, id)
+		    __field(__u8, isc)
+		    ),
+
+	    TP_fast_assign(
+		    __entry->id = id;
+		    __entry->isc = isc;
+		    ),
+
+	    TP_printk("adapter I/O interrupt suppressed (id:%x isc:%x)",
+		      __entry->id, __entry->isc)
+	);
+
 
 #endif /* _TRACE_KVMS390_H */
 
-- 
cgit v1.2.3-55-g7522


From 47a4693e1d3eb09e523c223753fb5a97721f49b8 Mon Sep 17 00:00:00 2001
From: Yi Min Zhao
Date: Fri, 10 Mar 2017 09:29:38 +0100
Subject: KVM: s390: introduce AIS capability

Introduce a capability (KVM_CAP_S390_AIS) to enable the AIS facility
bit (72), and add documentation for this capability.

Signed-off-by: Yi Min Zhao <zyimin@linux.vnet.ibm.com>
Signed-off-by: Fei Li <sherrylf@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 Documentation/virtual/kvm/api.txt |  8 ++++++++
 arch/s390/kvm/kvm-s390.c          | 15 +++++++++++++++
 include/uapi/linux/kvm.h          |  1 +
 3 files changed, 24 insertions(+)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 725250858479..598278cd0dc5 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4110,6 +4110,14 @@ Returns: 0 on success; -EINVAL if the machine does not support
 
 Allows use of guarded storage for the KVM guest.
 
+7.10 KVM_CAP_S390_AIS
+
+Architectures: s390
+Parameters: none
+
+Allow use of adapter-interruption suppression.
+Returns: 0 on success; -EBUSY if a VCPU has already been created.
+
 8. Other capabilities.
 ----------------------
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 977cc1660a83..11b7d6638991 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -380,6 +380,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_SKEYS:
 	case KVM_CAP_S390_IRQ_STATE:
 	case KVM_CAP_S390_USER_INSTR0:
+	case KVM_CAP_S390_AIS:
 		r = 1;
 		break;
 	case KVM_CAP_S390_MEM_OP:
@@ -544,6 +545,20 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 		VM_EVENT(kvm, 3, "ENABLE: CAP_S390_RI %s",
 			 r ? "(not available)" : "(success)");
 		break;
+	case KVM_CAP_S390_AIS:
+		mutex_lock(&kvm->lock);
+		if (kvm->created_vcpus) {
+			r = -EBUSY;
+		} else {
+			set_kvm_facility(kvm->arch.model.fac_mask, 72);
+			set_kvm_facility(kvm->arch.model.fac_list, 72);
+			kvm->arch.float_int.ais_enabled = 1;
+			r = 0;
+		}
+		mutex_unlock(&kvm->lock);
+		VM_EVENT(kvm, 3, "ENABLE: AIS %s",
+			 r ? "(not available)" : "(success)");
+		break;
 	case KVM_CAP_S390_GS:
 		r = -EINVAL;
 		mutex_lock(&kvm->lock);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index c9d522765f8f..33dd2a4e36dc 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -884,6 +884,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_MMU_HASH_V3 135
 #define KVM_CAP_IMMEDIATE_EXIT 136
 #define KVM_CAP_S390_GS 137
+#define KVM_CAP_S390_AIS 138
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3-55-g7522


From ad6260da1e23cf937806e42c8490af3ff4530474 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Mon, 27 Mar 2017 14:30:40 +0200
Subject: KVM: x86: drop legacy device assignment

Legacy device assignment has been deprecated since 4.2 (released
1.5 years ago).  VFIO is better and everyone should have switched to it.
If they haven't, this should convince them. :)

Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  204 -------
 arch/x86/kvm/Kconfig              |   12 -
 arch/x86/kvm/Makefile             |    2 -
 arch/x86/kvm/assigned-dev.c       | 1058 -------------------------------------
 arch/x86/kvm/assigned-dev.h       |   32 --
 arch/x86/kvm/iommu.c              |  356 -------------
 arch/x86/kvm/x86.c                |   14 +-
 include/linux/kvm_host.h          |   16 -
 virt/kvm/kvm_main.c               |   17 -
 9 files changed, 1 insertion(+), 1710 deletions(-)
 delete mode 100644 arch/x86/kvm/assigned-dev.c
 delete mode 100644 arch/x86/kvm/assigned-dev.h
 delete mode 100644 arch/x86/kvm/iommu.c

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 753e88e5eb2a..1a184843bf9c 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1326,130 +1326,6 @@ The flags bitmap is defined as:
    /* the host supports the ePAPR idle hcall
    #define KVM_PPC_PVINFO_FLAGS_EV_IDLE   (1<<0)
 
-4.48 KVM_ASSIGN_PCI_DEVICE (deprecated)
-
-Capability: none
-Architectures: x86
-Type: vm ioctl
-Parameters: struct kvm_assigned_pci_dev (in)
-Returns: 0 on success, -1 on error
-
-Assigns a host PCI device to the VM.
-
-struct kvm_assigned_pci_dev {
-	__u32 assigned_dev_id;
-	__u32 busnr;
-	__u32 devfn;
-	__u32 flags;
-	__u32 segnr;
-	union {
-		__u32 reserved[11];
-	};
-};
-
-The PCI device is specified by the triple segnr, busnr, and devfn.
-Identification in succeeding service requests is done via assigned_dev_id. The
-following flags are specified:
-
-/* Depends on KVM_CAP_IOMMU */
-#define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
-/* The following two depend on KVM_CAP_PCI_2_3 */
-#define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
-#define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
-
-If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
-via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other
-assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
-guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
-
-The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
-isolation of the device.  Usages not specifying this flag are deprecated.
-
-Only PCI header type 0 devices with PCI BAR resources are supported by
-device assignment.  The user requesting this ioctl must have read/write
-access to the PCI sysfs resource files associated with the device.
-
-Errors:
-  ENOTTY: kernel does not support this ioctl
-
-  Other error conditions may be defined by individual device types or
-  have their standard meanings.
-
-
-4.49 KVM_DEASSIGN_PCI_DEVICE (deprecated)
-
-Capability: none
-Architectures: x86
-Type: vm ioctl
-Parameters: struct kvm_assigned_pci_dev (in)
-Returns: 0 on success, -1 on error
-
-Ends PCI device assignment, releasing all associated resources.
-
-See KVM_ASSIGN_PCI_DEVICE for the data structure. Only assigned_dev_id is
-used in kvm_assigned_pci_dev to identify the device.
-
-Errors:
-  ENOTTY: kernel does not support this ioctl
-
-  Other error conditions may be defined by individual device types or
-  have their standard meanings.
-
-4.50 KVM_ASSIGN_DEV_IRQ (deprecated)
-
-Capability: KVM_CAP_ASSIGN_DEV_IRQ
-Architectures: x86
-Type: vm ioctl
-Parameters: struct kvm_assigned_irq (in)
-Returns: 0 on success, -1 on error
-
-Assigns an IRQ to a passed-through device.
-
-struct kvm_assigned_irq {
-	__u32 assigned_dev_id;
-	__u32 host_irq; /* ignored (legacy field) */
-	__u32 guest_irq;
-	__u32 flags;
-	union {
-		__u32 reserved[12];
-	};
-};
-
-The following flags are defined:
-
-#define KVM_DEV_IRQ_HOST_INTX    (1 << 0)
-#define KVM_DEV_IRQ_HOST_MSI     (1 << 1)
-#define KVM_DEV_IRQ_HOST_MSIX    (1 << 2)
-
-#define KVM_DEV_IRQ_GUEST_INTX   (1 << 8)
-#define KVM_DEV_IRQ_GUEST_MSI    (1 << 9)
-#define KVM_DEV_IRQ_GUEST_MSIX   (1 << 10)
-
-It is not valid to specify multiple types per host or guest IRQ. However, the
-IRQ type of host and guest can differ or can even be null.
-
-Errors:
-  ENOTTY: kernel does not support this ioctl
-
-  Other error conditions may be defined by individual device types or
-  have their standard meanings.
-
-
-4.51 KVM_DEASSIGN_DEV_IRQ (deprecated)
-
-Capability: KVM_CAP_ASSIGN_DEV_IRQ
-Architectures: x86
-Type: vm ioctl
-Parameters: struct kvm_assigned_irq (in)
-Returns: 0 on success, -1 on error
-
-Ends an IRQ assignment to a passed-through device.
-
-See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
-by assigned_dev_id, flags must correspond to the IRQ type specified on
-KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed.
-
-
 4.52 KVM_SET_GSI_ROUTING
 
 Capability: KVM_CAP_IRQ_ROUTING
@@ -1536,52 +1412,6 @@ struct kvm_irq_routing_hv_sint {
 	__u32 sint;
 };
 
-4.53 KVM_ASSIGN_SET_MSIX_NR (deprecated)
-
-Capability: none
-Architectures: x86
-Type: vm ioctl
-Parameters: struct kvm_assigned_msix_nr (in)
-Returns: 0 on success, -1 on error
-
-Set the number of MSI-X interrupts for an assigned device. The number is
-reset again by terminating the MSI-X assignment of the device via
-KVM_DEASSIGN_DEV_IRQ. Calling this service more than once at any earlier
-point will fail.
-
-struct kvm_assigned_msix_nr {
-	__u32 assigned_dev_id;
-	__u16 entry_nr;
-	__u16 padding;
-};
-
-#define KVM_MAX_MSIX_PER_DEV		256
-
-
-4.54 KVM_ASSIGN_SET_MSIX_ENTRY (deprecated)
-
-Capability: none
-Architectures: x86
-Type: vm ioctl
-Parameters: struct kvm_assigned_msix_entry (in)
-Returns: 0 on success, -1 on error
-
-Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting
-the GSI vector to zero means disabling the interrupt.
-
-struct kvm_assigned_msix_entry {
-	__u32 assigned_dev_id;
-	__u32 gsi;
-	__u16 entry; /* The index of entry in the MSI-X table */
-	__u16 padding[3];
-};
-
-Errors:
-  ENOTTY: kernel does not support this ioctl
-
-  Other error conditions may be defined by individual device types or
-  have their standard meanings.
-
 
 4.55 KVM_SET_TSC_KHZ
 
@@ -1733,40 +1563,6 @@ should skip processing the bitmap and just invalidate everything.  It must
 be set to the number of set bits in the bitmap.
 
 
-4.61 KVM_ASSIGN_SET_INTX_MASK (deprecated)
-
-Capability: KVM_CAP_PCI_2_3
-Architectures: x86
-Type: vm ioctl
-Parameters: struct kvm_assigned_pci_dev (in)
-Returns: 0 on success, -1 on error
-
-Allows userspace to mask PCI INTx interrupts from the assigned device.  The
-kernel will not deliver INTx interrupts to the guest between setting and
-clearing of KVM_ASSIGN_SET_INTX_MASK via this interface.  This enables use of
-and emulation of PCI 2.3 INTx disable command register behavior.
-
-This may be used for both PCI 2.3 devices supporting INTx disable natively and
-older devices lacking this support. Userspace is responsible for emulating the
-read value of the INTx disable bit in the guest visible PCI command register.
-When modifying the INTx disable state, userspace should precede updating the
-physical device command register by calling this ioctl to inform the kernel of
-the new intended INTx mask state.
-
-Note that the kernel uses the device INTx disable bit to internally manage the
-device interrupt state for PCI 2.3 devices.  Reads of this register may
-therefore not match the expected value.  Writes should always use the guest
-intended INTx disable value rather than attempting to read-copy-update the
-current physical device state.  Races between user and kernel updates to the
-INTx disable bit are handled lazily in the kernel.  It's possible the device
-may generate unintended interrupts, but they will not be injected into the
-guest.
-
-See KVM_ASSIGN_DEV_IRQ for the data structure.  The target device is specified
-by assigned_dev_id.  In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
-evaluated.
-
-
 4.62 KVM_CREATE_SPAPR_TCE
 
 Capability: KVM_CAP_SPAPR_TCE
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ab8e32f7b9a8..760433b2574a 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -86,18 +86,6 @@ config KVM_MMU_AUDIT
 	 This option adds a R/W kVM module parameter 'mmu_audit', which allows
 	 auditing of KVM MMU events at runtime.
 
-config KVM_DEVICE_ASSIGNMENT
-	bool "KVM legacy PCI device assignment support (DEPRECATED)"
-	depends on KVM && PCI && IOMMU_API
-	default n
-	---help---
-	  Provide support for legacy PCI device assignment through KVM.  The
-	  kernel now also supports a full featured userspace device driver
-	  framework through VFIO, which supersedes this support and provides
-	  better security.
-
-	  If unsure, say N.
-
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 3bff20710471..09d4b17be022 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -15,8 +15,6 @@ kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
 			   hyperv.o page_track.o debugfs.o
 
-kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= assigned-dev.o iommu.o
-
 kvm-intel-y		+= vmx.o pmu_intel.o
 kvm-amd-y		+= svm.o pmu_amd.o
 
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
deleted file mode 100644
index 308b8597c691..000000000000
--- a/arch/x86/kvm/assigned-dev.c
+++ /dev/null
@@ -1,1058 +0,0 @@
-/*
- * Kernel-based Virtual Machine - device assignment support
- *
- * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include <linux/kvm_host.h>
-#include <linux/kvm.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/errno.h>
-#include <linux/spinlock.h>
-#include <linux/pci.h>
-#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/namei.h>
-#include <linux/fs.h>
-#include "irq.h"
-#include "assigned-dev.h"
-#include "trace/events/kvm.h"
-
-struct kvm_assigned_dev_kernel {
-	struct kvm_irq_ack_notifier ack_notifier;
-	struct list_head list;
-	int assigned_dev_id;
-	int host_segnr;
-	int host_busnr;
-	int host_devfn;
-	unsigned int entries_nr;
-	int host_irq;
-	bool host_irq_disabled;
-	bool pci_2_3;
-	struct msix_entry *host_msix_entries;
-	int guest_irq;
-	struct msix_entry *guest_msix_entries;
-	unsigned long irq_requested_type;
-	int irq_source_id;
-	int flags;
-	struct pci_dev *dev;
-	struct kvm *kvm;
-	spinlock_t intx_lock;
-	spinlock_t intx_mask_lock;
-	char irq_name[32];
-	struct pci_saved_state *pci_saved_state;
-};
-
-static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
-						      int assigned_dev_id)
-{
-	struct kvm_assigned_dev_kernel *match;
-
-	list_for_each_entry(match, head, list) {
-		if (match->assigned_dev_id == assigned_dev_id)
-			return match;
-	}
-	return NULL;
-}
-
-static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
-				    *assigned_dev, int irq)
-{
-	int i, index;
-	struct msix_entry *host_msix_entries;
-
-	host_msix_entries = assigned_dev->host_msix_entries;
-
-	index = -1;
-	for (i = 0; i < assigned_dev->entries_nr; i++)
-		if (irq == host_msix_entries[i].vector) {
-			index = i;
-			break;
-		}
-	if (index < 0)
-		printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
-
-	return index;
-}
-
-static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-	int ret;
-
-	spin_lock(&assigned_dev->intx_lock);
-	if (pci_check_and_mask_intx(assigned_dev->dev)) {
-		assigned_dev->host_irq_disabled = true;
-		ret = IRQ_WAKE_THREAD;
-	} else
-		ret = IRQ_NONE;
-	spin_unlock(&assigned_dev->intx_lock);
-
-	return ret;
-}
-
-static void
-kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
-				 int vector)
-{
-	if (unlikely(assigned_dev->irq_requested_type &
-		     KVM_DEV_IRQ_GUEST_INTX)) {
-		spin_lock(&assigned_dev->intx_mask_lock);
-		if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
-			kvm_set_irq(assigned_dev->kvm,
-				    assigned_dev->irq_source_id, vector, 1,
-				    false);
-		spin_unlock(&assigned_dev->intx_mask_lock);
-	} else
-		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-			    vector, 1, false);
-}
-
-static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-
-	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
-		spin_lock_irq(&assigned_dev->intx_lock);
-		disable_irq_nosync(irq);
-		assigned_dev->host_irq_disabled = true;
-		spin_unlock_irq(&assigned_dev->intx_lock);
-	}
-
-	kvm_assigned_dev_raise_guest_irq(assigned_dev,
-					 assigned_dev->guest_irq);
-
-	return IRQ_HANDLED;
-}
-
-/*
- * Deliver an IRQ in an atomic context if we can, or return a failure,
- * user can retry in a process context.
- * Return value:
- *  -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
- *  Other values - No need to retry.
- */
-static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
-				int level)
-{
-	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
-	struct kvm_kernel_irq_routing_entry *e;
-	int ret = -EINVAL;
-	int idx;
-
-	trace_kvm_set_irq(irq, level, irq_source_id);
-
-	/*
-	 * Injection into either PIC or IOAPIC might need to scan all CPUs,
-	 * which would need to be retried from thread context;  when same GSI
-	 * is connected to both PIC and IOAPIC, we'd have to report a
-	 * partial failure here.
-	 * Since there's no easy way to do this, we only support injecting MSI
-	 * which is limited to 1:1 GSI mapping.
-	 */
-	idx = srcu_read_lock(&kvm->irq_srcu);
-	if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
-		e = &entries[0];
-		ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id,
-						irq, level);
-	}
-	srcu_read_unlock(&kvm->irq_srcu, idx);
-	return ret;
-}
-
-
-static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-	int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
-				       assigned_dev->irq_source_id,
-				       assigned_dev->guest_irq, 1);
-	return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
-}
-
-static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-
-	kvm_assigned_dev_raise_guest_irq(assigned_dev,
-					 assigned_dev->guest_irq);
-
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-	int index = find_index_from_host_irq(assigned_dev, irq);
-	u32 vector;
-	int ret = 0;
-
-	if (index >= 0) {
-		vector = assigned_dev->guest_msix_entries[index].vector;
-		ret = kvm_set_irq_inatomic(assigned_dev->kvm,
-					   assigned_dev->irq_source_id,
-					   vector, 1);
-	}
-
-	return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
-}
-
-static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-	int index = find_index_from_host_irq(assigned_dev, irq);
-	u32 vector;
-
-	if (index >= 0) {
-		vector = assigned_dev->guest_msix_entries[index].vector;
-		kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
-	}
-
-	return IRQ_HANDLED;
-}
-
-/* Ack the irq line for an assigned device */
-static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
-{
-	struct kvm_assigned_dev_kernel *dev =
-		container_of(kian, struct kvm_assigned_dev_kernel,
-			     ack_notifier);
-
-	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false);
-
-	spin_lock(&dev->intx_mask_lock);
-
-	if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
-		bool reassert = false;
-
-		spin_lock_irq(&dev->intx_lock);
-		/*
-		 * The guest IRQ may be shared so this ack can come from an
-		 * IRQ for another guest device.
-		 */
-		if (dev->host_irq_disabled) {
-			if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
-				enable_irq(dev->host_irq);
-			else if (!pci_check_and_unmask_intx(dev->dev))
-				reassert = true;
-			dev->host_irq_disabled = reassert;
-		}
-		spin_unlock_irq(&dev->intx_lock);
-
-		if (reassert)
-			kvm_set_irq(dev->kvm, dev->irq_source_id,
-				    dev->guest_irq, 1, false);
-	}
-
-	spin_unlock(&dev->intx_mask_lock);
-}
-
-static void deassign_guest_irq(struct kvm *kvm,
-			       struct kvm_assigned_dev_kernel *assigned_dev)
-{
-	if (assigned_dev->ack_notifier.gsi != -1)
-		kvm_unregister_irq_ack_notifier(kvm,
-						&assigned_dev->ack_notifier);
-
-	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-		    assigned_dev->guest_irq, 0, false);
-
-	if (assigned_dev->irq_source_id != -1)
-		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
-	assigned_dev->irq_source_id = -1;
-	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
-}
-
-/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
-static void deassign_host_irq(struct kvm *kvm,
-			      struct kvm_assigned_dev_kernel *assigned_dev)
-{
-	/*
-	 * We disable irq here to prevent further events.
-	 *
-	 * Notice this maybe result in nested disable if the interrupt type is
-	 * INTx, but it's OK for we are going to free it.
-	 *
-	 * If this function is a part of VM destroy, please ensure that till
-	 * now, the kvm state is still legal for probably we also have to wait
-	 * on a currently running IRQ handler.
-	 */
-	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-		int i;
-		for (i = 0; i < assigned_dev->entries_nr; i++)
-			disable_irq(assigned_dev->host_msix_entries[i].vector);
-
-		for (i = 0; i < assigned_dev->entries_nr; i++)
-			free_irq(assigned_dev->host_msix_entries[i].vector,
-				 assigned_dev);
-
-		assigned_dev->entries_nr = 0;
-		kfree(assigned_dev->host_msix_entries);
-		kfree(assigned_dev->guest_msix_entries);
-		pci_disable_msix(assigned_dev->dev);
-	} else {
-		/* Deal with MSI and INTx */
-		if ((assigned_dev->irq_requested_type &
-		     KVM_DEV_IRQ_HOST_INTX) &&
-		    (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
-			spin_lock_irq(&assigned_dev->intx_lock);
-			pci_intx(assigned_dev->dev, false);
-			spin_unlock_irq(&assigned_dev->intx_lock);
-			synchronize_irq(assigned_dev->host_irq);
-		} else
-			disable_irq(assigned_dev->host_irq);
-
-		free_irq(assigned_dev->host_irq, assigned_dev);
-
-		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
-			pci_disable_msi(assigned_dev->dev);
-	}
-
-	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
-}
-
-static int kvm_deassign_irq(struct kvm *kvm,
-			    struct kvm_assigned_dev_kernel *assigned_dev,
-			    unsigned long irq_requested_type)
-{
-	unsigned long guest_irq_type, host_irq_type;
-
-	if (!irqchip_in_kernel(kvm))
-		return -EINVAL;
-	/* no irq assignment to deassign */
-	if (!assigned_dev->irq_requested_type)
-		return -ENXIO;
-
-	host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
-	guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
-
-	if (host_irq_type)
-		deassign_host_irq(kvm, assigned_dev);
-	if (guest_irq_type)
-		deassign_guest_irq(kvm, assigned_dev);
-
-	return 0;
-}
-
-static void kvm_free_assigned_irq(struct kvm *kvm,
-				  struct kvm_assigned_dev_kernel *assigned_dev)
-{
-	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
-}
-
-static void kvm_free_assigned_device(struct kvm *kvm,
-				     struct kvm_assigned_dev_kernel
-				     *assigned_dev)
-{
-	kvm_free_assigned_irq(kvm, assigned_dev);
-
-	pci_reset_function(assigned_dev->dev);
-	if (pci_load_and_free_saved_state(assigned_dev->dev,
-					  &assigned_dev->pci_saved_state))
-		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
-		       __func__, dev_name(&assigned_dev->dev->dev));
-	else
-		pci_restore_state(assigned_dev->dev);
-
-	pci_clear_dev_assigned(assigned_dev->dev);
-
-	pci_release_regions(assigned_dev->dev);
-	pci_disable_device(assigned_dev->dev);
-	pci_dev_put(assigned_dev->dev);
-
-	list_del(&assigned_dev->list);
-	kfree(assigned_dev);
-}
-
-void kvm_free_all_assigned_devices(struct kvm *kvm)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev, *tmp;
-
-	list_for_each_entry_safe(assigned_dev, tmp,
-				 &kvm->arch.assigned_dev_head, list) {
-		kvm_free_assigned_device(kvm, assigned_dev);
-	}
-}
-
-static int assigned_device_enable_host_intx(struct kvm *kvm,
-					    struct kvm_assigned_dev_kernel *dev)
-{
-	irq_handler_t irq_handler;
-	unsigned long flags;
-
-	dev->host_irq = dev->dev->irq;
-
-	/*
-	 * We can only share the IRQ line with other host devices if we are
-	 * able to disable the IRQ source at device-level - independently of
-	 * the guest driver. Otherwise host devices may suffer from unbounded
-	 * IRQ latencies when the guest keeps the line asserted.
-	 */
-	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
-		irq_handler = kvm_assigned_dev_intx;
-		flags = IRQF_SHARED;
-	} else {
-		irq_handler = NULL;
-		flags = IRQF_ONESHOT;
-	}
-	if (request_threaded_irq(dev->host_irq, irq_handler,
-				 kvm_assigned_dev_thread_intx, flags,
-				 dev->irq_name, dev))
-		return -EIO;
-
-	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
-		spin_lock_irq(&dev->intx_lock);
-		pci_intx(dev->dev, true);
-		spin_unlock_irq(&dev->intx_lock);
-	}
-	return 0;
-}
-
-static int assigned_device_enable_host_msi(struct kvm *kvm,
-					   struct kvm_assigned_dev_kernel *dev)
-{
-	int r;
-
-	if (!dev->dev->msi_enabled) {
-		r = pci_enable_msi(dev->dev);
-		if (r)
-			return r;
-	}
-
-	dev->host_irq = dev->dev->irq;
-	if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi,
-				 kvm_assigned_dev_thread_msi, 0,
-				 dev->irq_name, dev)) {
-		pci_disable_msi(dev->dev);
-		return -EIO;
-	}
-
-	return 0;
-}
-
-static int assigned_device_enable_host_msix(struct kvm *kvm,
-					    struct kvm_assigned_dev_kernel *dev)
-{
-	int i, r = -EINVAL;
-
-	/* host_msix_entries and guest_msix_entries should have been
-	 * initialized */
-	if (dev->entries_nr == 0)
-		return r;
-
-	r = pci_enable_msix_exact(dev->dev,
-				  dev->host_msix_entries, dev->entries_nr);
-	if (r)
-		return r;
-
-	for (i = 0; i < dev->entries_nr; i++) {
-		r = request_threaded_irq(dev->host_msix_entries[i].vector,
-					 kvm_assigned_dev_msix,
-					 kvm_assigned_dev_thread_msix,
-					 0, dev->irq_name, dev);
-		if (r)
-			goto err;
-	}
-
-	return 0;
-err:
-	for (i -= 1; i >= 0; i--)
-		free_irq(dev->host_msix_entries[i].vector, dev);
-	pci_disable_msix(dev->dev);
-	return r;
-}
-
-static int assigned_device_enable_guest_intx(struct kvm *kvm,
-				struct kvm_assigned_dev_kernel *dev,
-				struct kvm_assigned_irq *irq)
-{
-	dev->guest_irq = irq->guest_irq;
-	dev->ack_notifier.gsi = irq->guest_irq;
-	return 0;
-}
-
-static int assigned_device_enable_guest_msi(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *dev,
-			struct kvm_assigned_irq *irq)
-{
-	dev->guest_irq = irq->guest_irq;
-	dev->ack_notifier.gsi = -1;
-	return 0;
-}
-
-static int assigned_device_enable_guest_msix(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *dev,
-			struct kvm_assigned_irq *irq)
-{
-	dev->guest_irq = irq->guest_irq;
-	dev->ack_notifier.gsi = -1;
-	return 0;
-}
-
-static int assign_host_irq(struct kvm *kvm,
-			   struct kvm_assigned_dev_kernel *dev,
-			   __u32 host_irq_type)
-{
-	int r = -EEXIST;
-
-	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
-		return r;
-
-	snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
-		 pci_name(dev->dev));
-
-	switch (host_irq_type) {
-	case KVM_DEV_IRQ_HOST_INTX:
-		r = assigned_device_enable_host_intx(kvm, dev);
-		break;
-	case KVM_DEV_IRQ_HOST_MSI:
-		r = assigned_device_enable_host_msi(kvm, dev);
-		break;
-	case KVM_DEV_IRQ_HOST_MSIX:
-		r = assigned_device_enable_host_msix(kvm, dev);
-		break;
-	default:
-		r = -EINVAL;
-	}
-	dev->host_irq_disabled = false;
-
-	if (!r)
-		dev->irq_requested_type |= host_irq_type;
-
-	return r;
-}
-
-static int assign_guest_irq(struct kvm *kvm,
-			    struct kvm_assigned_dev_kernel *dev,
-			    struct kvm_assigned_irq *irq,
-			    unsigned long guest_irq_type)
-{
-	int id;
-	int r = -EEXIST;
-
-	if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
-		return r;
-
-	id = kvm_request_irq_source_id(kvm);
-	if (id < 0)
-		return id;
-
-	dev->irq_source_id = id;
-
-	switch (guest_irq_type) {
-	case KVM_DEV_IRQ_GUEST_INTX:
-		r = assigned_device_enable_guest_intx(kvm, dev, irq);
-		break;
-	case KVM_DEV_IRQ_GUEST_MSI:
-		r = assigned_device_enable_guest_msi(kvm, dev, irq);
-		break;
-	case KVM_DEV_IRQ_GUEST_MSIX:
-		r = assigned_device_enable_guest_msix(kvm, dev, irq);
-		break;
-	default:
-		r = -EINVAL;
-	}
-
-	if (!r) {
-		dev->irq_requested_type |= guest_irq_type;
-		if (dev->ack_notifier.gsi != -1)
-			kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
-	} else {
-		kvm_free_irq_source_id(kvm, dev->irq_source_id);
-		dev->irq_source_id = -1;
-	}
-
-	return r;
-}
-
-/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
-static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
-				   struct kvm_assigned_irq *assigned_irq)
-{
-	int r = -EINVAL;
-	struct kvm_assigned_dev_kernel *match;
-	unsigned long host_irq_type, guest_irq_type;
-
-	if (!irqchip_in_kernel(kvm))
-		return r;
-
-	mutex_lock(&kvm->lock);
-	r = -ENODEV;
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_irq->assigned_dev_id);
-	if (!match)
-		goto out;
-
-	host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
-	guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
-
-	r = -EINVAL;
-	/* can only assign one type at a time */
-	if (hweight_long(host_irq_type) > 1)
-		goto out;
-	if (hweight_long(guest_irq_type) > 1)
-		goto out;
-	if (host_irq_type == 0 && guest_irq_type == 0)
-		goto out;
-
-	r = 0;
-	if (host_irq_type)
-		r = assign_host_irq(kvm, match, host_irq_type);
-	if (r)
-		goto out;
-
-	if (guest_irq_type)
-		r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
-					 struct kvm_assigned_irq
-					 *assigned_irq)
-{
-	int r = -ENODEV;
-	struct kvm_assigned_dev_kernel *match;
-	unsigned long irq_type;
-
-	mutex_lock(&kvm->lock);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_irq->assigned_dev_id);
-	if (!match)
-		goto out;
-
-	irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
-					  KVM_DEV_IRQ_GUEST_MASK);
-	r = kvm_deassign_irq(kvm, match, irq_type);
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-/*
- * We want to test whether the caller has been granted permissions to
- * use this device.  To be able to configure and control the device,
- * the user needs access to PCI configuration space and BAR resources.
- * These are accessed through PCI sysfs.  PCI config space is often
- * passed to the process calling this ioctl via file descriptor, so we
- * can't rely on access to that file.  We can check for permissions
- * on each of the BAR resource files, which is a pretty clear
- * indicator that the user has been granted access to the device.
- */
-static int probe_sysfs_permissions(struct pci_dev *dev)
-{
-#ifdef CONFIG_SYSFS
-	int i;
-	bool bar_found = false;
-
-	for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
-		char *kpath, *syspath;
-		struct path path;
-		struct inode *inode;
-		int r;
-
-		if (!pci_resource_len(dev, i))
-			continue;
-
-		kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
-		if (!kpath)
-			return -ENOMEM;
-
-		/* Per sysfs-rules, sysfs is always at /sys */
-		syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
-		kfree(kpath);
-		if (!syspath)
-			return -ENOMEM;
-
-		r = kern_path(syspath, LOOKUP_FOLLOW, &path);
-		kfree(syspath);
-		if (r)
-			return r;
-
-		inode = d_backing_inode(path.dentry);
-
-		r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
-		path_put(&path);
-		if (r)
-			return r;
-
-		bar_found = true;
-	}
-
-	/* If no resources, probably something special */
-	if (!bar_found)
-		return -EPERM;
-
-	return 0;
-#else
-	return -EINVAL; /* No way to control the device without sysfs */
-#endif
-}
-
-static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
-				      struct kvm_assigned_pci_dev *assigned_dev)
-{
-	int r = 0, idx;
-	struct kvm_assigned_dev_kernel *match;
-	struct pci_dev *dev;
-
-	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
-		return -EINVAL;
-
-	mutex_lock(&kvm->lock);
-	idx = srcu_read_lock(&kvm->srcu);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_dev->assigned_dev_id);
-	if (match) {
-		/* device already assigned */
-		r = -EEXIST;
-		goto out;
-	}
-
-	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
-	if (match == NULL) {
-		printk(KERN_INFO "%s: Couldn't allocate memory\n",
-		       __func__);
-		r = -ENOMEM;
-		goto out;
-	}
-	dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
-				   assigned_dev->busnr,
-				   assigned_dev->devfn);
-	if (!dev) {
-		printk(KERN_INFO "%s: host device not found\n", __func__);
-		r = -EINVAL;
-		goto out_free;
-	}
-
-	/* Don't allow bridges to be assigned */
-	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) {
-		r = -EPERM;
-		goto out_put;
-	}
-
-	r = probe_sysfs_permissions(dev);
-	if (r)
-		goto out_put;
-
-	if (pci_enable_device(dev)) {
-		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
-		r = -EBUSY;
-		goto out_put;
-	}
-	r = pci_request_regions(dev, "kvm_assigned_device");
-	if (r) {
-		printk(KERN_INFO "%s: Could not get access to device regions\n",
-		       __func__);
-		goto out_disable;
-	}
-
-	pci_reset_function(dev);
-	pci_save_state(dev);
-	match->pci_saved_state = pci_store_saved_state(dev);
-	if (!match->pci_saved_state)
-		printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
-		       __func__, dev_name(&dev->dev));
-
-	if (!pci_intx_mask_supported(dev))
-		assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
-
-	match->assigned_dev_id = assigned_dev->assigned_dev_id;
-	match->host_segnr = assigned_dev->segnr;
-	match->host_busnr = assigned_dev->busnr;
-	match->host_devfn = assigned_dev->devfn;
-	match->flags = assigned_dev->flags;
-	match->dev = dev;
-	spin_lock_init(&match->intx_lock);
-	spin_lock_init(&match->intx_mask_lock);
-	match->irq_source_id = -1;
-	match->kvm = kvm;
-	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
-
-	list_add(&match->list, &kvm->arch.assigned_dev_head);
-
-	if (!kvm->arch.iommu_domain) {
-		r = kvm_iommu_map_guest(kvm);
-		if (r)
-			goto out_list_del;
-	}
-	r = kvm_assign_device(kvm, match->dev);
-	if (r)
-		goto out_list_del;
-
-out:
-	srcu_read_unlock(&kvm->srcu, idx);
-	mutex_unlock(&kvm->lock);
-	return r;
-out_list_del:
-	if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
-		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
-		       __func__, dev_name(&dev->dev));
-	list_del(&match->list);
-	pci_release_regions(dev);
-out_disable:
-	pci_disable_device(dev);
-out_put:
-	pci_dev_put(dev);
-out_free:
-	kfree(match);
-	srcu_read_unlock(&kvm->srcu, idx);
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
-		struct kvm_assigned_pci_dev *assigned_dev)
-{
-	int r = 0;
-	struct kvm_assigned_dev_kernel *match;
-
-	mutex_lock(&kvm->lock);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_dev->assigned_dev_id);
-	if (!match) {
-		printk(KERN_INFO "%s: device hasn't been assigned before, "
-		  "so cannot be deassigned\n", __func__);
-		r = -EINVAL;
-		goto out;
-	}
-
-	kvm_deassign_device(kvm, match->dev);
-
-	kvm_free_assigned_device(kvm, match);
-
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-
-static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
-				    struct kvm_assigned_msix_nr *entry_nr)
-{
-	int r = 0;
-	struct kvm_assigned_dev_kernel *adev;
-
-	mutex_lock(&kvm->lock);
-
-	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      entry_nr->assigned_dev_id);
-	if (!adev) {
-		r = -EINVAL;
-		goto msix_nr_out;
-	}
-
-	if (adev->entries_nr == 0) {
-		adev->entries_nr = entry_nr->entry_nr;
-		if (adev->entries_nr == 0 ||
-		    adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
-			r = -EINVAL;
-			goto msix_nr_out;
-		}
-
-		adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
-						entry_nr->entry_nr,
-						GFP_KERNEL);
-		if (!adev->host_msix_entries) {
-			r = -ENOMEM;
-			goto msix_nr_out;
-		}
-		adev->guest_msix_entries =
-			kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
-				GFP_KERNEL);
-		if (!adev->guest_msix_entries) {
-			kfree(adev->host_msix_entries);
-			r = -ENOMEM;
-			goto msix_nr_out;
-		}
-	} else /* Not allowed set MSI-X number twice */
-		r = -EINVAL;
-msix_nr_out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
-				       struct kvm_assigned_msix_entry *entry)
-{
-	int r = 0, i;
-	struct kvm_assigned_dev_kernel *adev;
-
-	mutex_lock(&kvm->lock);
-
-	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      entry->assigned_dev_id);
-
-	if (!adev) {
-		r = -EINVAL;
-		goto msix_entry_out;
-	}
-
-	for (i = 0; i < adev->entries_nr; i++)
-		if (adev->guest_msix_entries[i].vector == 0 ||
-		    adev->guest_msix_entries[i].entry == entry->entry) {
-			adev->guest_msix_entries[i].entry = entry->entry;
-			adev->guest_msix_entries[i].vector = entry->gsi;
-			adev->host_msix_entries[i].entry = entry->entry;
-			break;
-		}
-	if (i == adev->entries_nr) {
-		r = -ENOSPC;
-		goto msix_entry_out;
-	}
-
-msix_entry_out:
-	mutex_unlock(&kvm->lock);
-
-	return r;
-}
-
-static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
-		struct kvm_assigned_pci_dev *assigned_dev)
-{
-	int r = 0;
-	struct kvm_assigned_dev_kernel *match;
-
-	mutex_lock(&kvm->lock);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_dev->assigned_dev_id);
-	if (!match) {
-		r = -ENODEV;
-		goto out;
-	}
-
-	spin_lock(&match->intx_mask_lock);
-
-	match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
-	match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
-
-	if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
-		if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
-			kvm_set_irq(match->kvm, match->irq_source_id,
-				    match->guest_irq, 0, false);
-			/*
-			 * Masking at hardware-level is performed on demand,
-			 * i.e. when an IRQ actually arrives at the host.
-			 */
-		} else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
-			/*
-			 * Unmask the IRQ line if required. Unmasking at
-			 * device level will be performed by user space.
-			 */
-			spin_lock_irq(&match->intx_lock);
-			if (match->host_irq_disabled) {
-				enable_irq(match->host_irq);
-				match->host_irq_disabled = false;
-			}
-			spin_unlock_irq(&match->intx_lock);
-		}
-	}
-
-	spin_unlock(&match->intx_mask_lock);
-
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
-				  unsigned long arg)
-{
-	void __user *argp = (void __user *)arg;
-	int r;
-
-	switch (ioctl) {
-	case KVM_ASSIGN_PCI_DEVICE: {
-		struct kvm_assigned_pci_dev assigned_dev;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-			goto out;
-		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_ASSIGN_IRQ: {
-		r = -EOPNOTSUPP;
-		break;
-	}
-	case KVM_ASSIGN_DEV_IRQ: {
-		struct kvm_assigned_irq assigned_irq;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
-			goto out;
-		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_DEASSIGN_DEV_IRQ: {
-		struct kvm_assigned_irq assigned_irq;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
-			goto out;
-		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_DEASSIGN_PCI_DEVICE: {
-		struct kvm_assigned_pci_dev assigned_dev;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-			goto out;
-		r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_ASSIGN_SET_MSIX_NR: {
-		struct kvm_assigned_msix_nr entry_nr;
-		r = -EFAULT;
-		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
-			goto out;
-		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_ASSIGN_SET_MSIX_ENTRY: {
-		struct kvm_assigned_msix_entry entry;
-		r = -EFAULT;
-		if (copy_from_user(&entry, argp, sizeof entry))
-			goto out;
-		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_ASSIGN_SET_INTX_MASK: {
-		struct kvm_assigned_pci_dev assigned_dev;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-			goto out;
-		r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
-		break;
-	}
-	default:
-		r = -ENOTTY;
-		break;
-	}
-out:
-	return r;
-}
diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h
deleted file mode 100644
index a428c1a211b2..000000000000
--- a/arch/x86/kvm/assigned-dev.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H
-#define ARCH_X86_KVM_ASSIGNED_DEV_H
-
-#include <linux/kvm_host.h>
-
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
-int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev);
-int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev);
-
-int kvm_iommu_map_guest(struct kvm *kvm);
-int kvm_iommu_unmap_guest(struct kvm *kvm);
-
-long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
-				  unsigned long arg);
-
-void kvm_free_all_assigned_devices(struct kvm *kvm);
-#else
-static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
-{
-	return 0;
-}
-
-static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
-						unsigned long arg)
-{
-	return -ENOTTY;
-}
-
-static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {}
-#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */
-
-#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
deleted file mode 100644
index b181426f67b4..000000000000
--- a/arch/x86/kvm/iommu.c
+++ /dev/null
@@ -1,356 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * Copyright (C) 2006-2008 Intel Corporation
- * Copyright IBM Corporation, 2008
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Author: Allen M. Kay <allen.m.kay@intel.com>
- * Author: Weidong Han <weidong.han@intel.com>
- * Author: Ben-Ami Yassour <benami@il.ibm.com>
- */
-
-#include <linux/list.h>
-#include <linux/kvm_host.h>
-#include <linux/moduleparam.h>
-#include <linux/pci.h>
-#include <linux/stat.h>
-#include <linux/iommu.h>
-#include "assigned-dev.h"
-
-static bool allow_unsafe_assigned_interrupts;
-module_param_named(allow_unsafe_assigned_interrupts,
-		   allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(allow_unsafe_assigned_interrupts,
- "Enable device assignment on platforms without interrupt remapping support.");
-
-static int kvm_iommu_unmap_memslots(struct kvm *kvm);
-static void kvm_iommu_put_pages(struct kvm *kvm,
-				gfn_t base_gfn, unsigned long npages);
-
-static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
-			   unsigned long npages)
-{
-	gfn_t end_gfn;
-	kvm_pfn_t pfn;
-
-	pfn     = gfn_to_pfn_memslot(slot, gfn);
-	end_gfn = gfn + npages;
-	gfn    += 1;
-
-	if (is_error_noslot_pfn(pfn))
-		return pfn;
-
-	while (gfn < end_gfn)
-		gfn_to_pfn_memslot(slot, gfn++);
-
-	return pfn;
-}
-
-static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn,
-		unsigned long npages)
-{
-	unsigned long i;
-
-	for (i = 0; i < npages; ++i)
-		kvm_release_pfn_clean(pfn + i);
-}
-
-int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
-	gfn_t gfn, end_gfn;
-	kvm_pfn_t pfn;
-	int r = 0;
-	struct iommu_domain *domain = kvm->arch.iommu_domain;
-	int flags;
-
-	/* check if iommu exists and in use */
-	if (!domain)
-		return 0;
-
-	gfn     = slot->base_gfn;
-	end_gfn = gfn + slot->npages;
-
-	flags = IOMMU_READ;
-	if (!(slot->flags & KVM_MEM_READONLY))
-		flags |= IOMMU_WRITE;
-	if (!kvm->arch.iommu_noncoherent)
-		flags |= IOMMU_CACHE;
-
-
-	while (gfn < end_gfn) {
-		unsigned long page_size;
-
-		/* Check if already mapped */
-		if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) {
-			gfn += 1;
-			continue;
-		}
-
-		/* Get the page size we could use to map */
-		page_size = kvm_host_page_size(kvm, gfn);
-
-		/* Make sure the page_size does not exceed the memslot */
-		while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn)
-			page_size >>= 1;
-
-		/* Make sure gfn is aligned to the page size we want to map */
-		while ((gfn << PAGE_SHIFT) & (page_size - 1))
-			page_size >>= 1;
-
-		/* Make sure hva is aligned to the page size we want to map */
-		while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
-			page_size >>= 1;
-
-		/*
-		 * Pin all pages we are about to map in memory. This is
-		 * important because we unmap and unpin in 4kb steps later.
-		 */
-		pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT);
-		if (is_error_noslot_pfn(pfn)) {
-			gfn += 1;
-			continue;
-		}
-
-		/* Map into IO address space */
-		r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
-			      page_size, flags);
-		if (r) {
-			printk(KERN_ERR "kvm_iommu_map_address:"
-			       "iommu failed to map pfn=%llx\n", pfn);
-			kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT);
-			goto unmap_pages;
-		}
-
-		gfn += page_size >> PAGE_SHIFT;
-
-		cond_resched();
-	}
-
-	return 0;
-
-unmap_pages:
-	kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn);
-	return r;
-}
-
-static int kvm_iommu_map_memslots(struct kvm *kvm)
-{
-	int idx, r = 0;
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
-
-	if (kvm->arch.iommu_noncoherent)
-		kvm_arch_register_noncoherent_dma(kvm);
-
-	idx = srcu_read_lock(&kvm->srcu);
-	slots = kvm_memslots(kvm);
-
-	kvm_for_each_memslot(memslot, slots) {
-		r = kvm_iommu_map_pages(kvm, memslot);
-		if (r)
-			break;
-	}
-	srcu_read_unlock(&kvm->srcu, idx);
-
-	return r;
-}
-
-int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev)
-{
-	struct iommu_domain *domain = kvm->arch.iommu_domain;
-	int r;
-	bool noncoherent;
-
-	/* check if iommu exists and in use */
-	if (!domain)
-		return 0;
-
-	if (pdev == NULL)
-		return -ENODEV;
-
-	r = iommu_attach_device(domain, &pdev->dev);
-	if (r) {
-		dev_err(&pdev->dev, "kvm assign device failed ret %d", r);
-		return r;
-	}
-
-	noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY);
-
-	/* Check if need to update IOMMU page table for guest memory */
-	if (noncoherent != kvm->arch.iommu_noncoherent) {
-		kvm_iommu_unmap_memslots(kvm);
-		kvm->arch.iommu_noncoherent = noncoherent;
-		r = kvm_iommu_map_memslots(kvm);
-		if (r)
-			goto out_unmap;
-	}
-
-	kvm_arch_start_assignment(kvm);
-	pci_set_dev_assigned(pdev);
-
-	dev_info(&pdev->dev, "kvm assign device\n");
-
-	return 0;
-out_unmap:
-	kvm_iommu_unmap_memslots(kvm);
-	return r;
-}
-
-int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev)
-{
-	struct iommu_domain *domain = kvm->arch.iommu_domain;
-
-	/* check if iommu exists and in use */
-	if (!domain)
-		return 0;
-
-	if (pdev == NULL)
-		return -ENODEV;
-
-	iommu_detach_device(domain, &pdev->dev);
-
-	pci_clear_dev_assigned(pdev);
-	kvm_arch_end_assignment(kvm);
-
-	dev_info(&pdev->dev, "kvm deassign device\n");
-
-	return 0;
-}
-
-int kvm_iommu_map_guest(struct kvm *kvm)
-{
-	int r;
-
-	if (!iommu_present(&pci_bus_type)) {
-		printk(KERN_ERR "%s: iommu not found\n", __func__);
-		return -ENODEV;
-	}
-
-	mutex_lock(&kvm->slots_lock);
-
-	kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type);
-	if (!kvm->arch.iommu_domain) {
-		r = -ENOMEM;
-		goto out_unlock;
-	}
-
-	if (!allow_unsafe_assigned_interrupts &&
-	    !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) {
-		printk(KERN_WARNING "%s: No interrupt remapping support,"
-		       " disallowing device assignment."
-		       " Re-enable with \"allow_unsafe_assigned_interrupts=1\""
-		       " module option.\n", __func__);
-		iommu_domain_free(kvm->arch.iommu_domain);
-		kvm->arch.iommu_domain = NULL;
-		r = -EPERM;
-		goto out_unlock;
-	}
-
-	r = kvm_iommu_map_memslots(kvm);
-	if (r)
-		kvm_iommu_unmap_memslots(kvm);
-
-out_unlock:
-	mutex_unlock(&kvm->slots_lock);
-	return r;
-}
-
-static void kvm_iommu_put_pages(struct kvm *kvm,
-				gfn_t base_gfn, unsigned long npages)
-{
-	struct iommu_domain *domain;
-	gfn_t end_gfn, gfn;
-	kvm_pfn_t pfn;
-	u64 phys;
-
-	domain  = kvm->arch.iommu_domain;
-	end_gfn = base_gfn + npages;
-	gfn     = base_gfn;
-
-	/* check if iommu exists and in use */
-	if (!domain)
-		return;
-
-	while (gfn < end_gfn) {
-		unsigned long unmap_pages;
-		size_t size;
-
-		/* Get physical address */
-		phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
-
-		if (!phys) {
-			gfn++;
-			continue;
-		}
-
-		pfn  = phys >> PAGE_SHIFT;
-
-		/* Unmap address from IO address space */
-		size       = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE);
-		unmap_pages = 1ULL << get_order(size);
-
-		/* Unpin all pages we just unmapped to not leak any memory */
-		kvm_unpin_pages(kvm, pfn, unmap_pages);
-
-		gfn += unmap_pages;
-
-		cond_resched();
-	}
-}
-
-void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
-	kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages);
-}
-
-static int kvm_iommu_unmap_memslots(struct kvm *kvm)
-{
-	int idx;
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	slots = kvm_memslots(kvm);
-
-	kvm_for_each_memslot(memslot, slots)
-		kvm_iommu_unmap_pages(kvm, memslot);
-
-	srcu_read_unlock(&kvm->srcu, idx);
-
-	if (kvm->arch.iommu_noncoherent)
-		kvm_arch_unregister_noncoherent_dma(kvm);
-
-	return 0;
-}
-
-int kvm_iommu_unmap_guest(struct kvm *kvm)
-{
-	struct iommu_domain *domain = kvm->arch.iommu_domain;
-
-	/* check if iommu exists and in use */
-	if (!domain)
-		return 0;
-
-	mutex_lock(&kvm->slots_lock);
-	kvm_iommu_unmap_memslots(kvm);
-	kvm->arch.iommu_domain = NULL;
-	kvm->arch.iommu_noncoherent = false;
-	mutex_unlock(&kvm->slots_lock);
-
-	iommu_domain_free(domain);
-	return 0;
-}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ccbd45ecd41a..1853cda7f6d5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -27,7 +27,6 @@
 #include "kvm_cache_regs.h"
 #include "x86.h"
 #include "cpuid.h"
-#include "assigned-dev.h"
 #include "pmu.h"
 #include "hyperv.h"
 
@@ -2675,10 +2674,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_SET_BOOT_CPU_ID:
  	case KVM_CAP_SPLIT_IRQCHIP:
 	case KVM_CAP_IMMEDIATE_EXIT:
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
-	case KVM_CAP_ASSIGN_DEV_IRQ:
-	case KVM_CAP_PCI_2_3:
-#endif
 		r = 1;
 		break;
 	case KVM_CAP_ADJUST_CLOCK:
@@ -2713,11 +2708,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_PV_MMU:	/* obsolete */
 		r = 0;
 		break;
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
-	case KVM_CAP_IOMMU:
-		r = iommu_present(&pci_bus_type);
-		break;
-#endif
 	case KVM_CAP_MCE:
 		r = KVM_MAX_MCE_BANKS;
 		break;
@@ -4230,7 +4220,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		break;
 	}
 	default:
-		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
+		r = -ENOTTY;
 	}
 out:
 	return r;
@@ -8068,7 +8058,6 @@ void kvm_arch_sync_events(struct kvm *kvm)
 {
 	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
 	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
-	kvm_free_all_assigned_devices(kvm);
 	kvm_free_pit(kvm);
 }
 
@@ -8152,7 +8141,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	}
 	if (kvm_x86_ops->vm_destroy)
 		kvm_x86_ops->vm_destroy(kvm);
-	kvm_iommu_unmap_guest(kvm);
 	kvm_pic_destroy(kvm);
 	kvm_ioapic_destroy(kvm);
 	kvm_free_vcpus(kvm);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d0250744507a..f1339a7756b3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -877,22 +877,6 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
-int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
-void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
-#else
-static inline int kvm_iommu_map_pages(struct kvm *kvm,
-				      struct kvm_memory_slot *slot)
-{
-	return 0;
-}
-
-static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
-					 struct kvm_memory_slot *slot)
-{
-}
-#endif
-
 /*
  * search_memslots() and __gfn_to_memslot() are here because they are
  * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 88257b311cb5..ff3bf5d26e0b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1019,8 +1019,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
 		old_memslots = install_new_memslots(kvm, as_id, slots);
 
-		/* slot was deleted or moved, clear iommu mapping */
-		kvm_iommu_unmap_pages(kvm, &old);
 		/* From this point no new shadow pages pointing to a deleted,
 		 * or moved, memslot will be created.
 		 *
@@ -1055,21 +1053,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
 	kvm_free_memslot(kvm, &old, &new);
 	kvfree(old_memslots);
-
-	/*
-	 * IOMMU mapping:  New slots need to be mapped.  Old slots need to be
-	 * un-mapped and re-mapped if their base changes.  Since base change
-	 * unmapping is handled above with slot deletion, mapping alone is
-	 * needed here.  Anything else the iommu might care about for existing
-	 * slots (size changes, userspace addr changes and read-only flag
-	 * changes) is disallowed above, so any other attribute changes getting
-	 * here can be skipped.
-	 */
-	if (as_id == 0 && (change == KVM_MR_CREATE || change == KVM_MR_MOVE)) {
-		r = kvm_iommu_map_pages(kvm, &new);
-		return r;
-	}
-
 	return 0;
 
 out_slots:
-- 
cgit v1.2.3-55-g7522


From d824ca52abd020a36948d12f2c6704ea2ae12513 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Mon, 3 Apr 2017 19:38:07 +0100
Subject: arm/arm64: Add hyp-stub API documentation

In order to help people understand the hyp-stub API that exists
between the host kernel and the hypervisor mode (whether a hypervisor
has been installed or not), let's document said API.

As with any form of documentation, I expect it to become obsolete
and completely misleading within 20 minutes after having been merged.

Acked-by: Russell King <rmk+kernel@armlinux.org.uk>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <cdall@linaro.org>
---
 Documentation/virtual/kvm/arm/hyp-abi.txt | 53 +++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 Documentation/virtual/kvm/arm/hyp-abi.txt

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/arm/hyp-abi.txt b/Documentation/virtual/kvm/arm/hyp-abi.txt
new file mode 100644
index 000000000000..a20a0bee268d
--- /dev/null
+++ b/Documentation/virtual/kvm/arm/hyp-abi.txt
@@ -0,0 +1,53 @@
+* Internal ABI between the kernel and HYP
+
+This file documents the interaction between the Linux kernel and the
+hypervisor layer when running Linux as a hypervisor (for example
+KVM). It doesn't cover the interaction of the kernel with the
+hypervisor when running as a guest (under Xen, KVM or any other
+hypervisor), or any hypervisor-specific interaction when the kernel is
+used as a host.
+
+On arm and arm64 (without VHE), the kernel doesn't run in hypervisor
+mode, but still needs to interact with it, allowing a built-in
+hypervisor to be either installed or torn down.
+
+In order to achieve this, the kernel must be booted at HYP (arm) or
+EL2 (arm64), allowing it to install a set of stubs before dropping to
+SVC/EL1. These stubs are accessible by using a 'hvc #0' instruction,
+and only act on individual CPUs.
+
+Unless specified otherwise, any built-in hypervisor must implement
+these functions (see arch/arm{,64}/include/asm/virt.h):
+
+* r0/x0 = HVC_SET_VECTORS
+  r1/x1 = vectors
+
+  Set HVBAR/VBAR_EL2 to 'vectors' to enable a hypervisor. 'vectors'
+  must be a physical address, and respect the alignment requirements
+  of the architecture. Only implemented by the initial stubs, not by
+  Linux hypervisors.
+
+* r0/x0 = HVC_RESET_VECTORS
+
+  Turn HYP/EL2 MMU off, and reset HVBAR/VBAR_EL2 to the initial
+  stubs' exception vector value. This effectively disables an existing
+  hypervisor.
+
+* r0/x0 = HVC_SOFT_RESTART
+  r1/x1 = restart address
+  x2 = x0's value when entering the next payload (arm64)
+  x3 = x1's value when entering the next payload (arm64)
+  x4 = x2's value when entering the next payload (arm64)
+
+  Mask all exceptions, disable the MMU, move the arguments into place
+  (arm64 only), and jump to the restart address while at HYP/EL2. This
+  hypercall is not expected to return to its caller.
+
+Any other value of r0/x0 triggers a hypervisor-specific handling,
+which is not documented here.
+
+The return value of a stub hypercall is held by r0/x0, and is 0 on
+success, and HVC_STUB_ERR on error. A stub hypercall is allowed to
+clobber any of the caller-saved registers (x0-x18 on arm64, r0-r3 and
+ip on arm). It is thus recommended to use a function call to perform
+the hypercall.
-- 
cgit v1.2.3-55-g7522


From 3fe17e6826162021d5e9274949571b19fc94826b Mon Sep 17 00:00:00 2001
From: Alexander Graf
Date: Tue, 27 Sep 2016 21:08:05 +0200
Subject: KVM: arm/arm64: Add ARM user space interrupt signaling ABI

We have 2 modes for dealing with interrupts in the ARM world. We can
either handle them all using hardware acceleration through the vgic or
we can emulate a gic in user space and only drive CPU IRQ pins from
there.

Unfortunately, when driving IRQs from user space, we never tell user
space about events from devices emulated inside the kernel, which may
result in interrupt line state changes, so we lose out on for example
timer and PMU events if we run with user space gic emulation.

Define an ABI to publish such device output levels to userspace.

Reviewed-by: Alexander Graf <agraf@suse.de>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 Documentation/virtual/kvm/api.txt | 42 +++++++++++++++++++++++++++++++++++++++
 arch/arm/include/uapi/asm/kvm.h   |  2 ++
 arch/arm64/include/uapi/asm/kvm.h |  2 ++
 include/uapi/linux/kvm.h          |  8 ++++++++
 4 files changed, 54 insertions(+)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 3c248f772ae6..3b4e76e5201e 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4147,3 +4147,45 @@ This capability, if KVM_CHECK_EXTENSION indicates that it is
 available, means that that the kernel can support guests using the
 hashed page table MMU defined in Power ISA V3.00 (as implemented in
 the POWER9 processor), including in-memory segment tables.
+
+
+8.5 KVM_CAP_ARM_USER_IRQ
+
+Architectures: arm, arm64
+This capability, if KVM_CHECK_EXTENSION indicates that it is available, means
+that if userspace creates a VM without an in-kernel interrupt controller, it
+will be notified of changes to the output level of in-kernel emulated devices,
+which can generate virtual interrupts, presented to the VM.
+For such VMs, on every return to userspace, the kernel
+updates the vcpu's run->s.regs.device_irq_level field to represent the actual
+output level of the device.
+
+Whenever kvm detects a change in the device output level, kvm guarantees at
+least one return to userspace before running the VM.  This exit could either
+be a KVM_EXIT_INTR or any other exit event, like KVM_EXIT_MMIO. This way,
+userspace can always sample the device output level and re-compute the state of
+the userspace interrupt controller.  Userspace should always check the state
+of run->s.regs.device_irq_level on every kvm exit.
+The value in run->s.regs.device_irq_level can represent both level and edge
+triggered interrupt signals, depending on the device.  Edge triggered interrupt
+signals will exit to userspace with the bit in run->s.regs.device_irq_level
+set exactly once per edge signal.
+
+The field run->s.regs.device_irq_level is available independent of
+run->kvm_valid_regs or run->kvm_dirty_regs bits.
+
+If KVM_CAP_ARM_USER_IRQ is supported, the KVM_CHECK_EXTENSION ioctl returns a
+number larger than 0 indicating which version of this capability is implemented
+and thereby which bits in run->s.regs.device_irq_level can signal values.
+
+Currently the following bits are defined for the device_irq_level bitmap:
+
+  KVM_CAP_ARM_USER_IRQ >= 1:
+
+    KVM_ARM_DEV_EL1_VTIMER -  EL1 virtual timer
+    KVM_ARM_DEV_EL1_PTIMER -  EL1 physical timer
+    KVM_ARM_DEV_PMU        -  ARM PMU overflow interrupt signal
+
+Future versions of kvm may implement additional events. These will get
+indicated by returning a higher number from KVM_CHECK_EXTENSION and will be
+listed above.
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 6ebd3e6a1fd1..a5838d605e7b 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -114,6 +114,8 @@ struct kvm_debug_exit_arch {
 };
 
 struct kvm_sync_regs {
+	/* Used with KVM_CAP_ARM_USER_IRQ */
+	__u64 device_irq_level;
 };
 
 struct kvm_arch_memory_slot {
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index c2860358ae3e..cd6bea495e63 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -143,6 +143,8 @@ struct kvm_debug_exit_arch {
 #define KVM_GUESTDBG_USE_HW		(1 << 17)
 
 struct kvm_sync_regs {
+	/* Used with KVM_CAP_ARM_USER_IRQ */
+	__u64 device_irq_level;
 };
 
 struct kvm_arch_memory_slot {
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index f51d5082a377..6d6b9b237f0b 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -883,6 +883,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_MMU_RADIX 134
 #define KVM_CAP_PPC_MMU_HASH_V3 135
 #define KVM_CAP_IMMEDIATE_EXIT 136
+#define KVM_CAP_ARM_USER_IRQ 137
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1354,4 +1355,11 @@ struct kvm_assigned_msix_entry {
 #define KVM_X2APIC_API_USE_32BIT_IDS            (1ULL << 0)
 #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK  (1ULL << 1)
 
+/* Available with KVM_CAP_ARM_USER_IRQ */
+
+/* Bits for run->s.regs.device_irq_level */
+#define KVM_ARM_DEV_EL1_VTIMER		(1 << 0)
+#define KVM_ARM_DEV_EL1_PTIMER		(1 << 1)
+#define KVM_ARM_DEV_PMU			(1 << 2)
+
 #endif /* __LINUX_KVM_H */
-- 
cgit v1.2.3-55-g7522


From 121f80ba68f1a5779a36d7b3247206e60e0a7418 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy
Date: Wed, 22 Mar 2017 15:21:56 +1100
Subject: KVM: PPC: VFIO: Add in-kernel acceleration for VFIO

This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT
and H_STUFF_TCE requests targeting an IOMMU TCE table used for VFIO
without passing them to user space which saves time on switching
to user space and back.

This adds H_PUT_TCE/H_PUT_TCE_INDIRECT/H_STUFF_TCE handlers to KVM.
KVM tries to handle a TCE request in the real mode, if failed
it passes the request to the virtual mode to complete the operation.
If a virtual mode handler fails, the request is passed to
the user space; this is not expected to happen though.

To avoid dealing with page use counters (which is tricky in real mode),
this only accelerates SPAPR TCE IOMMU v2 clients which are required
to pre-register the userspace memory. The very first TCE request will
be handled in the VFIO SPAPR TCE driver anyway as the userspace view
of the TCE table (iommu_table::it_userspace) is not allocated till
the very first mapping happens and we cannot call vmalloc in real mode.

If we fail to update a hardware IOMMU table for an unexpected reason, we just
clear it and move on as there is nothing really we can do about it -
for example, if we hot plug a VFIO device to a guest, existing TCE tables
will be mirrored automatically to the hardware and there is no interface
to report to the guest about possible failures.

This adds new attribute - KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE - to
the VFIO KVM device. It takes a VFIO group fd and SPAPR TCE table fd
and associates a physical IOMMU table with the SPAPR TCE table (which
is a guest view of the hardware IOMMU table). The iommu_table object
is cached and referenced so we do not have to look up for it in real mode.

This does not implement the UNSET counterpart as there is no use for it -
once the acceleration is enabled, the existing userspace won't
disable it unless a VFIO container is destroyed; this adds necessary
cleanup to the KVM_DEV_VFIO_GROUP_DEL handler.

This advertises the new KVM_CAP_SPAPR_TCE_VFIO capability to the user
space.

This adds real mode version of WARN_ON_ONCE() as the generic version
causes problems with rcu_sched. Since we are testing what vmalloc_to_phys()
returns in the code, this also adds a check for already existing
vmalloc_to_phys() call in kvmppc_rm_h_put_tce_indirect().

This finally makes use of vfio_external_user_iommu_id() which was
introduced quite some time ago and was considered for removal.

Tests show that this patch increases transmission speed from 220MB/s
to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card).

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 Documentation/virtual/kvm/devices/vfio.txt |  18 +-
 arch/powerpc/include/asm/kvm_host.h        |   8 +
 arch/powerpc/include/asm/kvm_ppc.h         |   4 +
 arch/powerpc/kvm/book3s_64_vio.c           | 306 ++++++++++++++++++++++++++++-
 arch/powerpc/kvm/book3s_64_vio_hv.c        | 201 ++++++++++++++++++-
 arch/powerpc/kvm/powerpc.c                 |   2 +
 include/uapi/linux/kvm.h                   |   6 +
 virt/kvm/vfio.c                            | 105 ++++++++++
 8 files changed, 645 insertions(+), 5 deletions(-)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt
index ef51740c67ca..528c77c8022c 100644
--- a/Documentation/virtual/kvm/devices/vfio.txt
+++ b/Documentation/virtual/kvm/devices/vfio.txt
@@ -16,7 +16,21 @@ Groups:
 
 KVM_DEV_VFIO_GROUP attributes:
   KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking
+	kvm_device_attr.addr points to an int32_t file descriptor
+	for the VFIO group.
   KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking
+	kvm_device_attr.addr points to an int32_t file descriptor
+	for the VFIO group.
+  KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table
+	allocated by sPAPR KVM.
+	kvm_device_attr.addr points to a struct:
 
-For each, kvm_device_attr.addr points to an int32_t file descriptor
-for the VFIO group.
+	struct kvm_vfio_spapr_tce {
+		__s32	groupfd;
+		__s32	tablefd;
+	};
+
+	where
+	@groupfd is a file descriptor for a VFIO group;
+	@tablefd is a file descriptor for a TCE table allocated via
+		KVM_CREATE_SPAPR_TCE.
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 0f3ac09cbfe0..77c60826d145 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -188,6 +188,13 @@ struct kvmppc_pginfo {
 	atomic_t refcnt;
 };
 
+struct kvmppc_spapr_tce_iommu_table {
+	struct rcu_head rcu;
+	struct list_head next;
+	struct iommu_table *tbl;
+	struct kref kref;
+};
+
 struct kvmppc_spapr_tce_table {
 	struct list_head list;
 	struct kvm *kvm;
@@ -196,6 +203,7 @@ struct kvmppc_spapr_tce_table {
 	u32 page_shift;
 	u64 offset;		/* in pages */
 	u64 size;		/* window size in pages */
+	struct list_head iommu_tables;
 	struct page *pages[0];
 };
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 4d079a29eae2..5885d327c025 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -173,6 +173,10 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
 			struct kvm_memory_slot *memslot, unsigned long porder);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
+		struct iommu_group *grp);
+extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
+		struct iommu_group *grp);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				struct kvm_create_spapr_tce_64 *args);
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index d507d94e020c..a160c14304eb 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -28,6 +28,8 @@
 #include <linux/hugetlb.h>
 #include <linux/list.h>
 #include <linux/anon_inodes.h>
+#include <linux/iommu.h>
+#include <linux/file.h>
 
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
@@ -40,6 +42,7 @@
 #include <asm/udbg.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
+#include <asm/mmu_context.h>
 
 static unsigned long kvmppc_tce_pages(unsigned long iommu_pages)
 {
@@ -91,6 +94,137 @@ static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
 	return ret;
 }
 
+static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
+{
+	struct kvmppc_spapr_tce_iommu_table *stit = container_of(head,
+			struct kvmppc_spapr_tce_iommu_table, rcu);
+
+	iommu_tce_table_put(stit->tbl);
+
+	kfree(stit);
+}
+
+static void kvm_spapr_tce_liobn_put(struct kref *kref)
+{
+	struct kvmppc_spapr_tce_iommu_table *stit = container_of(kref,
+			struct kvmppc_spapr_tce_iommu_table, kref);
+
+	list_del_rcu(&stit->next);
+
+	call_rcu(&stit->rcu, kvm_spapr_tce_iommu_table_free);
+}
+
+extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
+		struct iommu_group *grp)
+{
+	int i;
+	struct kvmppc_spapr_tce_table *stt;
+	struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
+	struct iommu_table_group *table_group = NULL;
+
+	list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) {
+
+		table_group = iommu_group_get_iommudata(grp);
+		if (WARN_ON(!table_group))
+			continue;
+
+		list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) {
+			for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+				if (table_group->tables[i] != stit->tbl)
+					continue;
+
+				kref_put(&stit->kref, kvm_spapr_tce_liobn_put);
+				return;
+			}
+		}
+	}
+}
+
+extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
+		struct iommu_group *grp)
+{
+	struct kvmppc_spapr_tce_table *stt = NULL;
+	bool found = false;
+	struct iommu_table *tbl = NULL;
+	struct iommu_table_group *table_group;
+	long i;
+	struct kvmppc_spapr_tce_iommu_table *stit;
+	struct fd f;
+
+	f = fdget(tablefd);
+	if (!f.file)
+		return -EBADF;
+
+	list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt == f.file->private_data) {
+			found = true;
+			break;
+		}
+	}
+
+	fdput(f);
+
+	if (!found)
+		return -EINVAL;
+
+	table_group = iommu_group_get_iommudata(grp);
+	if (WARN_ON(!table_group))
+		return -EFAULT;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbltmp = table_group->tables[i];
+
+		if (!tbltmp)
+			continue;
+		/*
+		 * Make sure hardware table parameters are exactly the same;
+		 * this is used in the TCE handlers where boundary checks
+		 * use only the first attached table.
+		 */
+		if ((tbltmp->it_page_shift == stt->page_shift) &&
+				(tbltmp->it_offset == stt->offset) &&
+				(tbltmp->it_size == stt->size)) {
+			/*
+			 * Reference the table to avoid races with
+			 * add/remove DMA windows.
+			 */
+			tbl = iommu_tce_table_get(tbltmp);
+			break;
+		}
+	}
+	if (!tbl)
+		return -EINVAL;
+
+	list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
+		if (tbl != stit->tbl)
+			continue;
+
+		if (!kref_get_unless_zero(&stit->kref)) {
+			/* stit is being destroyed */
+			iommu_tce_table_put(tbl);
+			return -ENOTTY;
+		}
+		/*
+		 * The table is already known to this KVM, we just increased
+		 * its KVM reference counter and can return.
+		 */
+		return 0;
+	}
+
+	stit = kzalloc(sizeof(*stit), GFP_KERNEL);
+	if (!stit) {
+		iommu_tce_table_put(tbl);
+		return -ENOMEM;
+	}
+
+	stit->tbl = tbl;
+	kref_init(&stit->kref);
+
+	list_add_rcu(&stit->next, &stt->iommu_tables);
+
+	return 0;
+}
+
 static void release_spapr_tce_table(struct rcu_head *head)
 {
 	struct kvmppc_spapr_tce_table *stt = container_of(head,
@@ -130,9 +264,18 @@ static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
 static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
 {
 	struct kvmppc_spapr_tce_table *stt = filp->private_data;
+	struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
 
 	list_del_rcu(&stt->list);
 
+	list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) {
+		WARN_ON(!kref_read(&stit->kref));
+		while (1) {
+			if (kref_put(&stit->kref, kvm_spapr_tce_liobn_put))
+				break;
+		}
+	}
+
 	kvm_put_kvm(stt->kvm);
 
 	kvmppc_account_memlimit(
@@ -183,6 +326,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 	stt->offset = args->offset;
 	stt->size = size;
 	stt->kvm = kvm;
+	INIT_LIST_HEAD_RCU(&stt->iommu_tables);
 
 	for (i = 0; i < npages; i++) {
 		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
@@ -211,11 +355,101 @@ fail:
 	return ret;
 }
 
+static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
+{
+	unsigned long hpa = 0;
+	enum dma_data_direction dir = DMA_NONE;
+
+	iommu_tce_xchg(tbl, entry, &hpa, &dir);
+}
+
+static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
+		struct iommu_table *tbl, unsigned long entry)
+{
+	struct mm_iommu_table_group_mem_t *mem = NULL;
+	const unsigned long pgsize = 1ULL << tbl->it_page_shift;
+	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+
+	if (!pua)
+		/* it_userspace allocation might be delayed */
+		return H_TOO_HARD;
+
+	mem = mm_iommu_lookup(kvm->mm, *pua, pgsize);
+	if (!mem)
+		return H_TOO_HARD;
+
+	mm_iommu_mapped_dec(mem);
+
+	*pua = 0;
+
+	return H_SUCCESS;
+}
+
+static long kvmppc_tce_iommu_unmap(struct kvm *kvm,
+		struct iommu_table *tbl, unsigned long entry)
+{
+	enum dma_data_direction dir = DMA_NONE;
+	unsigned long hpa = 0;
+	long ret;
+
+	if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
+		return H_HARDWARE;
+
+	if (dir == DMA_NONE)
+		return H_SUCCESS;
+
+	ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
+	if (ret != H_SUCCESS)
+		iommu_tce_xchg(tbl, entry, &hpa, &dir);
+
+	return ret;
+}
+
+long kvmppc_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl,
+		unsigned long entry, unsigned long ua,
+		enum dma_data_direction dir)
+{
+	long ret;
+	unsigned long hpa, *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+	struct mm_iommu_table_group_mem_t *mem;
+
+	if (!pua)
+		/* it_userspace allocation might be delayed */
+		return H_TOO_HARD;
+
+	mem = mm_iommu_lookup(kvm->mm, ua, 1ULL << tbl->it_page_shift);
+	if (!mem)
+		/* This only handles v2 IOMMU type, v1 is handled via ioctl() */
+		return H_TOO_HARD;
+
+	if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, &hpa)))
+		return H_HARDWARE;
+
+	if (mm_iommu_mapped_inc(mem))
+		return H_CLOSED;
+
+	ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
+	if (WARN_ON_ONCE(ret)) {
+		mm_iommu_mapped_dec(mem);
+		return H_HARDWARE;
+	}
+
+	if (dir != DMA_NONE)
+		kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
+
+	*pua = ua;
+
+	return 0;
+}
+
 long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 		      unsigned long ioba, unsigned long tce)
 {
 	struct kvmppc_spapr_tce_table *stt;
-	long ret;
+	long ret, idx;
+	struct kvmppc_spapr_tce_iommu_table *stit;
+	unsigned long entry, ua = 0;
+	enum dma_data_direction dir;
 
 	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
 	/* 	    liobn, ioba, tce); */
@@ -232,7 +466,35 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 	if (ret != H_SUCCESS)
 		return ret;
 
-	kvmppc_tce_put(stt, ioba >> stt->page_shift, tce);
+	dir = iommu_tce_direction(tce);
+	if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
+			tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
+		return H_PARAMETER;
+
+	entry = ioba >> stt->page_shift;
+
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+		if (dir == DMA_NONE) {
+			ret = kvmppc_tce_iommu_unmap(vcpu->kvm,
+					stit->tbl, entry);
+		} else {
+			idx = srcu_read_lock(&vcpu->kvm->srcu);
+			ret = kvmppc_tce_iommu_map(vcpu->kvm, stit->tbl,
+					entry, ua, dir);
+			srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		}
+
+		if (ret == H_SUCCESS)
+			continue;
+
+		if (ret == H_TOO_HARD)
+			return ret;
+
+		WARN_ON_ONCE(1);
+		kvmppc_clear_tce(stit->tbl, entry);
+	}
+
+	kvmppc_tce_put(stt, entry, tce);
 
 	return H_SUCCESS;
 }
@@ -247,6 +509,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 	unsigned long entry, ua = 0;
 	u64 __user *tces;
 	u64 tce;
+	struct kvmppc_spapr_tce_iommu_table *stit;
 
 	stt = kvmppc_find_table(vcpu->kvm, liobn);
 	if (!stt)
@@ -285,6 +548,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		if (ret != H_SUCCESS)
 			goto unlock_exit;
 
+		if (kvmppc_gpa_to_ua(vcpu->kvm,
+				tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
+				&ua, NULL))
+			return H_PARAMETER;
+
+		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+			ret = kvmppc_tce_iommu_map(vcpu->kvm,
+					stit->tbl, entry + i, ua,
+					iommu_tce_direction(tce));
+
+			if (ret == H_SUCCESS)
+				continue;
+
+			if (ret == H_TOO_HARD)
+				goto unlock_exit;
+
+			WARN_ON_ONCE(1);
+			kvmppc_clear_tce(stit->tbl, entry);
+		}
+
 		kvmppc_tce_put(stt, entry + i, tce);
 	}
 
@@ -301,6 +584,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
 {
 	struct kvmppc_spapr_tce_table *stt;
 	long i, ret;
+	struct kvmppc_spapr_tce_iommu_table *stit;
 
 	stt = kvmppc_find_table(vcpu->kvm, liobn);
 	if (!stt)
@@ -314,6 +598,24 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
 	if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
 		return H_PARAMETER;
 
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+		unsigned long entry = ioba >> stit->tbl->it_page_shift;
+
+		for (i = 0; i < npages; ++i) {
+			ret = kvmppc_tce_iommu_unmap(vcpu->kvm,
+					stit->tbl, entry + i);
+
+			if (ret == H_SUCCESS)
+				continue;
+
+			if (ret == H_TOO_HARD)
+				return ret;
+
+			WARN_ON_ONCE(1);
+			kvmppc_clear_tce(stit->tbl, entry);
+		}
+	}
+
 	for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
 		kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
 
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 440d3ab5dc32..eda0a8f6fae8 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -40,6 +40,31 @@
 #include <asm/iommu.h>
 #include <asm/tce.h>
 
+#ifdef CONFIG_BUG
+
+#define WARN_ON_ONCE_RM(condition)	({			\
+	static bool __section(.data.unlikely) __warned;		\
+	int __ret_warn_once = !!(condition);			\
+								\
+	if (unlikely(__ret_warn_once && !__warned)) {		\
+		__warned = true;				\
+		pr_err("WARN_ON_ONCE_RM: (%s) at %s:%u\n",	\
+				__stringify(condition),		\
+				__func__, __LINE__);		\
+		dump_stack();					\
+	}							\
+	unlikely(__ret_warn_once);				\
+})
+
+#else
+
+#define WARN_ON_ONCE_RM(condition) ({				\
+	int __ret_warn_on = !!(condition);			\
+	unlikely(__ret_warn_on);				\
+})
+
+#endif
+
 #define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
 
 /*
@@ -161,11 +186,117 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
 EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+static void kvmppc_rm_clear_tce(struct iommu_table *tbl, unsigned long entry)
+{
+	unsigned long hpa = 0;
+	enum dma_data_direction dir = DMA_NONE;
+
+	iommu_tce_xchg_rm(tbl, entry, &hpa, &dir);
+}
+
+static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
+		struct iommu_table *tbl, unsigned long entry)
+{
+	struct mm_iommu_table_group_mem_t *mem = NULL;
+	const unsigned long pgsize = 1ULL << tbl->it_page_shift;
+	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+
+	if (!pua)
+		/* it_userspace allocation might be delayed */
+		return H_TOO_HARD;
+
+	pua = (void *) vmalloc_to_phys(pua);
+	if (WARN_ON_ONCE_RM(!pua))
+		return H_HARDWARE;
+
+	mem = mm_iommu_lookup_rm(kvm->mm, *pua, pgsize);
+	if (!mem)
+		return H_TOO_HARD;
+
+	mm_iommu_mapped_dec(mem);
+
+	*pua = 0;
+
+	return H_SUCCESS;
+}
+
+static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm,
+		struct iommu_table *tbl, unsigned long entry)
+{
+	enum dma_data_direction dir = DMA_NONE;
+	unsigned long hpa = 0;
+	long ret;
+
+	if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir))
+		/*
+		 * real mode xchg can fail if struct page crosses
+		 * a page boundary
+		 */
+		return H_TOO_HARD;
+
+	if (dir == DMA_NONE)
+		return H_SUCCESS;
+
+	ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry);
+	if (ret)
+		iommu_tce_xchg_rm(tbl, entry, &hpa, &dir);
+
+	return ret;
+}
+
+static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl,
+		unsigned long entry, unsigned long ua,
+		enum dma_data_direction dir)
+{
+	long ret;
+	unsigned long hpa = 0;
+	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+	struct mm_iommu_table_group_mem_t *mem;
+
+	if (!pua)
+		/* it_userspace allocation might be delayed */
+		return H_TOO_HARD;
+
+	mem = mm_iommu_lookup_rm(kvm->mm, ua, 1ULL << tbl->it_page_shift);
+	if (!mem)
+		return H_TOO_HARD;
+
+	if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa)))
+		return H_HARDWARE;
+
+	pua = (void *) vmalloc_to_phys(pua);
+	if (WARN_ON_ONCE_RM(!pua))
+		return H_HARDWARE;
+
+	if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
+		return H_CLOSED;
+
+	ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir);
+	if (ret) {
+		mm_iommu_mapped_dec(mem);
+		/*
+		 * real mode xchg can fail if struct page crosses
+		 * a page boundary
+		 */
+		return H_TOO_HARD;
+	}
+
+	if (dir != DMA_NONE)
+		kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry);
+
+	*pua = ua;
+
+	return 0;
+}
+
 long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 		unsigned long ioba, unsigned long tce)
 {
 	struct kvmppc_spapr_tce_table *stt;
 	long ret;
+	struct kvmppc_spapr_tce_iommu_table *stit;
+	unsigned long entry, ua = 0;
+	enum dma_data_direction dir;
 
 	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
 	/* 	    liobn, ioba, tce); */
@@ -182,7 +313,32 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 	if (ret != H_SUCCESS)
 		return ret;
 
-	kvmppc_tce_put(stt, ioba >> stt->page_shift, tce);
+	dir = iommu_tce_direction(tce);
+	if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
+			tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
+		return H_PARAMETER;
+
+	entry = ioba >> stt->page_shift;
+
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+		if (dir == DMA_NONE)
+			ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm,
+					stit->tbl, entry);
+		else
+			ret = kvmppc_rm_tce_iommu_map(vcpu->kvm,
+					stit->tbl, entry, ua, dir);
+
+		if (ret == H_SUCCESS)
+			continue;
+
+		if (ret == H_TOO_HARD)
+			return ret;
+
+		WARN_ON_ONCE_RM(1);
+		kvmppc_rm_clear_tce(stit->tbl, entry);
+	}
+
+	kvmppc_tce_put(stt, entry, tce);
 
 	return H_SUCCESS;
 }
@@ -223,6 +379,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 	unsigned long tces, entry, ua = 0;
 	unsigned long *rmap = NULL;
 	bool prereg = false;
+	struct kvmppc_spapr_tce_iommu_table *stit;
 
 	stt = kvmppc_find_table(vcpu->kvm, liobn);
 	if (!stt)
@@ -270,6 +427,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 			return H_TOO_HARD;
 
 		rmap = (void *) vmalloc_to_phys(rmap);
+		if (WARN_ON_ONCE_RM(!rmap))
+			return H_HARDWARE;
 
 		/*
 		 * Synchronize with the MMU notifier callbacks in
@@ -293,6 +452,27 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		if (ret != H_SUCCESS)
 			goto unlock_exit;
 
+		ua = 0;
+		if (kvmppc_gpa_to_ua(vcpu->kvm,
+				tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
+				&ua, NULL))
+			return H_PARAMETER;
+
+		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+			ret = kvmppc_rm_tce_iommu_map(vcpu->kvm,
+					stit->tbl, entry + i, ua,
+					iommu_tce_direction(tce));
+
+			if (ret == H_SUCCESS)
+				continue;
+
+			if (ret == H_TOO_HARD)
+				goto unlock_exit;
+
+			WARN_ON_ONCE_RM(1);
+			kvmppc_rm_clear_tce(stit->tbl, entry);
+		}
+
 		kvmppc_tce_put(stt, entry + i, tce);
 	}
 
@@ -309,6 +489,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
 {
 	struct kvmppc_spapr_tce_table *stt;
 	long i, ret;
+	struct kvmppc_spapr_tce_iommu_table *stit;
 
 	stt = kvmppc_find_table(vcpu->kvm, liobn);
 	if (!stt)
@@ -322,6 +503,24 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
 	if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
 		return H_PARAMETER;
 
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+		unsigned long entry = ioba >> stit->tbl->it_page_shift;
+
+		for (i = 0; i < npages; ++i) {
+			ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm,
+					stit->tbl, entry + i);
+
+			if (ret == H_SUCCESS)
+				continue;
+
+			if (ret == H_TOO_HARD)
+				return ret;
+
+			WARN_ON_ONCE_RM(1);
+			kvmppc_rm_clear_tce(stit->tbl, entry);
+		}
+	}
+
 	for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
 		kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6c7244879bfe..cf725c580fc5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -534,6 +534,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CAP_SPAPR_TCE:
 	case KVM_CAP_SPAPR_TCE_64:
+		/* fallthrough */
+	case KVM_CAP_SPAPR_TCE_VFIO:
 	case KVM_CAP_PPC_RTAS:
 	case KVM_CAP_PPC_FIXUP_HCALL:
 	case KVM_CAP_PPC_ENABLE_HCALL:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7b488eae61b8..3c168b6fd74b 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1097,6 +1097,7 @@ struct kvm_device_attr {
 #define  KVM_DEV_VFIO_GROUP			1
 #define   KVM_DEV_VFIO_GROUP_ADD			1
 #define   KVM_DEV_VFIO_GROUP_DEL			2
+#define   KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE		3
 
 enum kvm_device_type {
 	KVM_DEV_TYPE_FSL_MPIC_20	= 1,
@@ -1118,6 +1119,11 @@ enum kvm_device_type {
 	KVM_DEV_TYPE_MAX,
 };
 
+struct kvm_vfio_spapr_tce {
+	__s32	groupfd;
+	__s32	tablefd;
+};
+
 /*
  * ioctls for VM fds
  */
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index d32f239eb471..37d9118fd84b 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -20,6 +20,10 @@
 #include <linux/vfio.h>
 #include "vfio.h"
 
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+#include <asm/kvm_ppc.h>
+#endif
+
 struct kvm_vfio_group {
 	struct list_head node;
 	struct vfio_group *vfio_group;
@@ -89,6 +93,47 @@ static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group)
 	return ret > 0;
 }
 
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+static int kvm_vfio_external_user_iommu_id(struct vfio_group *vfio_group)
+{
+	int (*fn)(struct vfio_group *);
+	int ret = -EINVAL;
+
+	fn = symbol_get(vfio_external_user_iommu_id);
+	if (!fn)
+		return ret;
+
+	ret = fn(vfio_group);
+
+	symbol_put(vfio_external_user_iommu_id);
+
+	return ret;
+}
+
+static struct iommu_group *kvm_vfio_group_get_iommu_group(
+		struct vfio_group *group)
+{
+	int group_id = kvm_vfio_external_user_iommu_id(group);
+
+	if (group_id < 0)
+		return NULL;
+
+	return iommu_group_get_by_id(group_id);
+}
+
+static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm,
+		struct vfio_group *vfio_group)
+{
+	struct iommu_group *grp = kvm_vfio_group_get_iommu_group(vfio_group);
+
+	if (WARN_ON_ONCE(!grp))
+		return;
+
+	kvm_spapr_tce_release_iommu_group(kvm, grp);
+	iommu_group_put(grp);
+}
+#endif
+
 /*
  * Groups can use the same or different IOMMU domains.  If the same then
  * adding a new group may change the coherency of groups we've previously
@@ -211,6 +256,9 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
 
 		mutex_unlock(&kv->lock);
 
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+		kvm_spapr_tce_release_vfio_group(dev->kvm, vfio_group);
+#endif
 		kvm_vfio_group_set_kvm(vfio_group, NULL);
 
 		kvm_vfio_group_put_external_user(vfio_group);
@@ -218,6 +266,57 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
 		kvm_vfio_update_coherency(dev);
 
 		return ret;
+
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: {
+		struct kvm_vfio_spapr_tce param;
+		struct kvm_vfio *kv = dev->private;
+		struct vfio_group *vfio_group;
+		struct kvm_vfio_group *kvg;
+		struct fd f;
+		struct iommu_group *grp;
+
+		if (copy_from_user(&param, (void __user *)arg,
+				sizeof(struct kvm_vfio_spapr_tce)))
+			return -EFAULT;
+
+		f = fdget(param.groupfd);
+		if (!f.file)
+			return -EBADF;
+
+		vfio_group = kvm_vfio_group_get_external_user(f.file);
+		fdput(f);
+
+		if (IS_ERR(vfio_group))
+			return PTR_ERR(vfio_group);
+
+		grp = kvm_vfio_group_get_iommu_group(vfio_group);
+		if (WARN_ON_ONCE(!grp)) {
+			kvm_vfio_group_put_external_user(vfio_group);
+			return -EIO;
+		}
+
+		ret = -ENOENT;
+
+		mutex_lock(&kv->lock);
+
+		list_for_each_entry(kvg, &kv->group_list, node) {
+			if (kvg->vfio_group != vfio_group)
+				continue;
+
+			ret = kvm_spapr_tce_attach_iommu_group(dev->kvm,
+					param.tablefd, grp);
+			break;
+		}
+
+		mutex_unlock(&kv->lock);
+
+		iommu_group_put(grp);
+		kvm_vfio_group_put_external_user(vfio_group);
+
+		return ret;
+	}
+#endif /* CONFIG_SPAPR_TCE_IOMMU */
 	}
 
 	return -ENXIO;
@@ -242,6 +341,9 @@ static int kvm_vfio_has_attr(struct kvm_device *dev,
 		switch (attr->attr) {
 		case KVM_DEV_VFIO_GROUP_ADD:
 		case KVM_DEV_VFIO_GROUP_DEL:
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+		case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE:
+#endif
 			return 0;
 		}
 
@@ -257,6 +359,9 @@ static void kvm_vfio_destroy(struct kvm_device *dev)
 	struct kvm_vfio_group *kvg, *tmp;
 
 	list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) {
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+		kvm_spapr_tce_release_vfio_group(dev->kvm, kvg->vfio_group);
+#endif
 		kvm_vfio_group_set_kvm(kvg->vfio_group, NULL);
 		kvm_vfio_group_put_external_user(kvg->vfio_group);
 		list_del(&kvg->node);
-- 
cgit v1.2.3-55-g7522


From 668fffa3f838edfcb1679f842f7ef1afa61c3e9a Mon Sep 17 00:00:00 2001
From: Michael S. Tsirkin
Date: Fri, 21 Apr 2017 12:27:17 +0200
Subject: kvm: better MWAIT emulation for guests

Guests that are heavy on futexes end up IPI'ing each other a lot. That
can lead to significant slowdowns and latency increase for those guests
when running within KVM.

If only a single guest is needed on a host, we have a lot of spare host
CPU time we can throw at the problem. Modern CPUs implement a feature
called "MWAIT" which allows guests to wake up sleeping remote CPUs without
an IPI - thus without an exit - at the expense of never going out of guest
context.

The decision whether this is something sensible to use should be up to the
VM admin, and thus to user space. We can however allow MWAIT execution on
systems that support it properly hardware-wise.

This patch adds a CAP to user space and a KVM cpuid leaf to indicate
availability of native MWAIT execution. With that enabled, the worst a
guest can do is waste as many cycles as a "jmp ." would do, so it's not
a privilege problem.

We consciously do *not* expose the feature in our CPUID bitmap, as most
people will want to benefit from sleeping vCPUs to allow for over commit.

Reported-by: "Gabriel L. Somlo" <gsomlo@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
[agraf: fix amd, change commit message]
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  9 +++++++++
 arch/x86/kvm/svm.c                |  7 +++++--
 arch/x86/kvm/vmx.c                |  6 ++++--
 arch/x86/kvm/x86.c                |  3 +++
 arch/x86/kvm/x86.h                | 36 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h          |  1 +
 6 files changed, 58 insertions(+), 4 deletions(-)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e60be91d8036..dc674c2b8b31 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4111,3 +4111,12 @@ reserved.
  2: MIPS64 or microMIPS64 with access to all address segments.
     Both registers and addresses are 64-bits wide.
     It will be possible to run 64-bit or 32-bit guest code.
+
+8.8 KVM_CAP_X86_GUEST_MWAIT
+
+Architectures: x86
+
+This capability indicates that a guest using memory monitoring instructions
+(MWAIT/MWAITX) to stop the virtual CPU will not cause a VM exit.  As such, time
+spent while the virtual CPU is halted in this way will then be accounted for as
+guest running time on the host (as opposed to e.g. HLT).
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1b203abf76e1..c41f03e5090a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1198,10 +1198,13 @@ static void init_vmcb(struct vcpu_svm *svm)
 	set_intercept(svm, INTERCEPT_CLGI);
 	set_intercept(svm, INTERCEPT_SKINIT);
 	set_intercept(svm, INTERCEPT_WBINVD);
-	set_intercept(svm, INTERCEPT_MONITOR);
-	set_intercept(svm, INTERCEPT_MWAIT);
 	set_intercept(svm, INTERCEPT_XSETBV);
 
+	if (!kvm_mwait_in_guest()) {
+		set_intercept(svm, INTERCEPT_MONITOR);
+		set_intercept(svm, INTERCEPT_MWAIT);
+	}
+
 	control->iopm_base_pa = iopm_base;
 	control->msrpm_base_pa = __pa(svm->msrpm);
 	control->int_ctl = V_INTR_MASKING_MASK;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c1a12b94e1fd..a4ef63718101 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3527,11 +3527,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_USE_IO_BITMAPS |
 	      CPU_BASED_MOV_DR_EXITING |
 	      CPU_BASED_USE_TSC_OFFSETING |
-	      CPU_BASED_MWAIT_EXITING |
-	      CPU_BASED_MONITOR_EXITING |
 	      CPU_BASED_INVLPG_EXITING |
 	      CPU_BASED_RDPMC_EXITING;
 
+	if (!kvm_mwait_in_guest())
+		min |= CPU_BASED_MWAIT_EXITING |
+			CPU_BASED_MONITOR_EXITING;
+
 	opt = CPU_BASED_TPR_SHADOW |
 	      CPU_BASED_USE_MSR_BITMAPS |
 	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 49a69c0a0d50..2f9fe6bf7091 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2687,6 +2687,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_ADJUST_CLOCK:
 		r = KVM_CLOCK_TSC_STABLE;
 		break;
+	case KVM_CAP_X86_GUEST_MWAIT:
+		r = kvm_mwait_in_guest();
+		break;
 	case KVM_CAP_X86_SMM:
 		/* SMBASE is usually relocated above 1M on modern chipsets,
 		 * and SMM handlers might indeed rely on 4G segment limits,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e8ff3e4ce38a..612067074905 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -1,6 +1,8 @@
 #ifndef ARCH_X86_KVM_X86_H
 #define ARCH_X86_KVM_X86_H
 
+#include <asm/processor.h>
+#include <asm/mwait.h>
 #include <linux/kvm_host.h>
 #include <asm/pvclock.h>
 #include "kvm_cache_regs.h"
@@ -212,4 +214,38 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 	    __rem;						\
 	 })
 
+static inline bool kvm_mwait_in_guest(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT))
+		return false;
+
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		/* All AMD CPUs have a working MWAIT implementation */
+		return true;
+	case X86_VENDOR_INTEL:
+		/* Handle Intel below */
+		break;
+	default:
+		return false;
+	}
+
+	/*
+	 * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as
+	 * they would allow guest to stop the CPU completely by disabling
+	 * interrupts then invoking MWAIT.
+	 */
+	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+		return false;
+
+	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+
+	if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK))
+		return false;
+
+	return true;
+}
+
 #endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 3c168b6fd74b..e43906b95d9f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -893,6 +893,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_S390_GS 140
 #define KVM_CAP_S390_AIS 141
 #define KVM_CAP_SPAPR_TCE_VFIO 142
+#define KVM_CAP_X86_GUEST_MWAIT 143
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3-55-g7522


From e000b8e0968dd7bfa09c444607ce1e48e57aafd3 Mon Sep 17 00:00:00 2001
From: Jason J. Herne
Date: Mon, 20 Mar 2017 09:57:42 -0400
Subject: s390: kvm: Cpu model support for msa6, msa7 and msa8

msa6 and msa7 require no changes.
msa8 adds kma instruction and feature area.

Signed-off-by: Jason J. Herne <jjherne@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 Documentation/virtual/kvm/devices/vm.txt | 3 ++-
 arch/s390/include/uapi/asm/kvm.h         | 3 ++-
 arch/s390/kvm/kvm-s390.c                 | 4 ++++
 arch/s390/tools/gen_facilities.c         | 1 +
 tools/arch/s390/include/uapi/asm/kvm.h   | 3 ++-
 5 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt
index b6cda49f2ba4..575ccb022aac 100644
--- a/Documentation/virtual/kvm/devices/vm.txt
+++ b/Documentation/virtual/kvm/devices/vm.txt
@@ -140,7 +140,8 @@ struct kvm_s390_vm_cpu_subfunc {
        u8 kmo[16];           # valid with Message-Security-Assist-Extension 4
        u8 pcc[16];           # valid with Message-Security-Assist-Extension 4
        u8 ppno[16];          # valid with Message-Security-Assist-Extension 5
-       u8 reserved[1824];    # reserved for future instructions
+       u8 kma[16];           # valid with Message-Security-Assist-Extension 8
+       u8 reserved[1808];    # reserved for future instructions
 };
 
 Parameters: address of a buffer to load the subfunction blocks from.
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index bf9267930939..3dd2a1d308dd 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -141,7 +141,8 @@ struct kvm_s390_vm_cpu_subfunc {
 	__u8 kmo[16];		/* with MSA4 */
 	__u8 pcc[16];		/* with MSA4 */
 	__u8 ppno[16];		/* with MSA5 */
-	__u8 reserved[1824];
+	__u8 kma[16];		/* with MSA8 */
+	__u8 reserved[1808];
 };
 
 /* kvm attributes for crypto */
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 8771fef112a1..7eb1275cc265 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -276,6 +276,10 @@ static void kvm_s390_cpu_feat_init(void)
 		__cpacf_query(CPACF_PPNO, (cpacf_mask_t *)
 			      kvm_s390_available_subfunc.ppno);
 
+	if (test_facility(146)) /* MSA8 */
+		__cpacf_query(CPACF_KMA, (cpacf_mask_t *)
+			      kvm_s390_available_subfunc.kma);
+
 	if (MACHINE_HAS_ESOP)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
 	/*
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index 0cf802de52a1..be63fbd699fd 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -82,6 +82,7 @@ static struct facility_def facility_defs[] = {
 			78, /* enhanced-DAT 2 */
 			130, /* instruction-execution-protection */
 			131, /* enhanced-SOP 2 and side-effect */
+			146, /* msa extension 8 */
 			-1  /* END */
 		}
 	},
diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h
index a2ffec4139ad..7f4fd65e9208 100644
--- a/tools/arch/s390/include/uapi/asm/kvm.h
+++ b/tools/arch/s390/include/uapi/asm/kvm.h
@@ -131,7 +131,8 @@ struct kvm_s390_vm_cpu_subfunc {
 	__u8 kmo[16];		/* with MSA4 */
 	__u8 pcc[16];		/* with MSA4 */
 	__u8 ppno[16];		/* with MSA5 */
-	__u8 reserved[1824];
+	__u8 kma[16];		/* with MSA8 */
+	__u8 reserved[1808];
 };
 
 /* kvm attributes for crypto */
-- 
cgit v1.2.3-55-g7522


From bcb85c887647c0f096b51ac2ee391ea792b14b9e Mon Sep 17 00:00:00 2001
From: Jann Horn
Date: Mon, 24 Apr 2017 11:16:49 +0200
Subject: KVM: Documentation: remove VM mmap documentation

Since commit 80f5b5e700fa9c ("KVM: remove vm mmap method"), the VM mmap
handler is gone. Remove the corresponding documentation.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'Documentation/virtual')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index f038f8cafa70..4029943887a3 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -110,11 +110,7 @@ Type: system ioctl
 Parameters: machine type identifier (KVM_VM_*)
 Returns: a VM fd that can be used to control the new virtual machine.
 
-The new VM has no virtual cpus and no memory.  An mmap() of a VM fd
-will access the virtual machine's physical address space; offset zero
-corresponds to guest physical address zero.  Use of mmap() on a VM fd
-is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is
-available.
+The new VM has no virtual cpus and no memory.
 You probably want to use 0 as machine type.
 
 In order to create user controlled virtual machines on S390, check
-- 
cgit v1.2.3-55-g7522