summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGleb Natapov2013-04-28 11:50:07 +0200
committerGleb Natapov2013-04-28 11:50:07 +0200
commit064d1afaa5a60fc391d0b4b77599fc8f63f99cd3 (patch)
tree2e640cdfa50b0048c52e021f07a8b24560251b26
parentKVM: x86: Rework request for immediate exit (diff)
parentKVM: PPC: Book3S: Facilities to save/restore XICS presentation ctrler state (diff)
downloadkernel-qcow2-linux-064d1afaa5a60fc391d0b4b77599fc8f63f99cd3.tar.gz
kernel-qcow2-linux-064d1afaa5a60fc391d0b4b77599fc8f63f99cd3.tar.xz
kernel-qcow2-linux-064d1afaa5a60fc391d0b4b77599fc8f63f99cd3.zip
Merge git://github.com/agraf/linux-2.6.git kvm-ppc-next into queue
-rw-r--r--Documentation/virtual/kvm/api.txt114
-rw-r--r--Documentation/virtual/kvm/devices/README1
-rw-r--r--Documentation/virtual/kvm/devices/mpic.txt56
-rw-r--r--arch/ia64/include/asm/kvm_host.h1
-rw-r--r--arch/ia64/kvm/Kconfig1
-rw-r--r--arch/ia64/kvm/Makefile2
-rw-r--r--arch/powerpc/include/asm/hvcall.h3
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h5
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h13
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_asm.h8
-rw-r--r--arch/powerpc/include/asm/kvm_host.h40
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h107
-rw-r--r--arch/powerpc/include/asm/reg.h1
-rw-r--r--arch/powerpc/include/uapi/asm/kvm.h73
-rw-r--r--arch/powerpc/kernel/asm-offsets.c3
-rw-r--r--arch/powerpc/kvm/44x.c12
-rw-r--r--arch/powerpc/kvm/Kconfig26
-rw-r--r--arch/powerpc/kvm/Makefile12
-rw-r--r--arch/powerpc/kvm/book3s.c27
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c120
-rw-r--r--arch/powerpc/kvm/book3s_emulate.c4
-rw-r--r--arch/powerpc/kvm/book3s_hv.c88
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c11
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_xics.c406
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S228
-rw-r--r--arch/powerpc/kvm/book3s_pr.c5
-rw-r--r--arch/powerpc/kvm/book3s_pr_papr.c21
-rw-r--r--arch/powerpc/kvm/book3s_rtas.c274
-rw-r--r--arch/powerpc/kvm/book3s_xics.c1130
-rw-r--r--arch/powerpc/kvm/book3s_xics.h129
-rw-r--r--arch/powerpc/kvm/booke.c123
-rw-r--r--arch/powerpc/kvm/e500.c14
-rw-r--r--arch/powerpc/kvm/e500.h22
-rw-r--r--arch/powerpc/kvm/e500_emulate.c19
-rw-r--r--arch/powerpc/kvm/e500_mmu.c192
-rw-r--r--arch/powerpc/kvm/e500mc.c16
-rw-r--r--arch/powerpc/kvm/irq.h17
-rw-r--r--arch/powerpc/kvm/mpic.c1843
-rw-r--r--arch/powerpc/kvm/powerpc.c72
-rw-r--r--arch/powerpc/sysdev/xics/icp-native.c8
-rw-r--r--arch/x86/include/asm/kvm_host.h2
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/x86.c1
-rw-r--r--include/linux/kvm_host.h54
-rw-r--r--include/trace/events/kvm.h12
-rw-r--r--include/uapi/linux/kvm.h36
-rw-r--r--virt/kvm/Kconfig3
-rw-r--r--virt/kvm/assigned-dev.c30
-rw-r--r--virt/kvm/eventfd.c6
-rw-r--r--virt/kvm/irq_comm.c194
-rw-r--r--virt/kvm/irqchip.c237
-rw-r--r--virt/kvm/kvm_main.c173
53 files changed, 5549 insertions, 449 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 976eb650e7ef..c09d1832e935 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1792,6 +1792,23 @@ registers, find a list below:
PPC | KVM_REG_PPC_TSR | 32
PPC | KVM_REG_PPC_OR_TSR | 32
PPC | KVM_REG_PPC_CLEAR_TSR | 32
+ PPC | KVM_REG_PPC_MAS0 | 32
+ PPC | KVM_REG_PPC_MAS1 | 32
+ PPC | KVM_REG_PPC_MAS2 | 64
+ PPC | KVM_REG_PPC_MAS7_3 | 64
+ PPC | KVM_REG_PPC_MAS4 | 32
+ PPC | KVM_REG_PPC_MAS6 | 32
+ PPC | KVM_REG_PPC_MMUCFG | 32
+ PPC | KVM_REG_PPC_TLB0CFG | 32
+ PPC | KVM_REG_PPC_TLB1CFG | 32
+ PPC | KVM_REG_PPC_TLB2CFG | 32
+ PPC | KVM_REG_PPC_TLB3CFG | 32
+ PPC | KVM_REG_PPC_TLB0PS | 32
+ PPC | KVM_REG_PPC_TLB1PS | 32
+ PPC | KVM_REG_PPC_TLB2PS | 32
+ PPC | KVM_REG_PPC_TLB3PS | 32
+ PPC | KVM_REG_PPC_EPTCFG | 32
+ PPC | KVM_REG_PPC_ICP_STATE | 64
ARM registers are mapped using the lower 32 bits. The upper 16 of that
is the register group type, or coprocessor number:
@@ -2173,6 +2190,76 @@ header; first `n_valid' valid entries with contents from the data
written, then `n_invalid' invalid entries, invalidating any previously
valid entries found.
+4.79 KVM_CREATE_DEVICE
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: vm ioctl
+Parameters: struct kvm_create_device (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+ ENODEV: The device type is unknown or unsupported
+ EEXIST: Device already created, and this type of device may not
+ be instantiated multiple times
+
+ Other error conditions may be defined by individual device types or
+ have their standard meanings.
+
+Creates an emulated device in the kernel. The file descriptor returned
+in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR.
+
+If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the
+device type is supported (not necessarily whether it can be created
+in the current vm).
+
+Individual devices should not define flags. Attributes should be used
+for specifying any behavior that is not implied by the device type
+number.
+
+struct kvm_create_device {
+ __u32 type; /* in: KVM_DEV_TYPE_xxx */
+ __u32 fd; /* out: device handle */
+ __u32 flags; /* in: KVM_CREATE_DEVICE_xxx */
+};
+
+4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: device ioctl
+Parameters: struct kvm_device_attr
+Returns: 0 on success, -1 on error
+Errors:
+ ENXIO: The group or attribute is unknown/unsupported for this device
+ EPERM: The attribute cannot (currently) be accessed this way
+ (e.g. read-only attribute, or attribute that only makes
+ sense when the device is in a different state)
+
+ Other error conditions may be defined by individual device types.
+
+Gets/sets a specified piece of device configuration and/or state. The
+semantics are device-specific. See individual device documentation in
+the "devices" directory. As with ONE_REG, the size of the data
+transferred is defined by the particular attribute.
+
+struct kvm_device_attr {
+ __u32 flags; /* no flags currently defined */
+ __u32 group; /* device-defined */
+ __u64 attr; /* group-defined */
+ __u64 addr; /* userspace address of attr data */
+};
+
+4.81 KVM_HAS_DEVICE_ATTR
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: device ioctl
+Parameters: struct kvm_device_attr
+Returns: 0 on success, -1 on error
+Errors:
+ ENXIO: The group or attribute is unknown/unsupported for this device
+
+Tests whether a device supports a particular attribute. A successful
+return indicates the attribute is implemented. It does not necessarily
+indicate that the attribute can be read or written in the device's
+current state. "addr" is ignored.
4.77 KVM_ARM_VCPU_INIT
@@ -2255,6 +2342,25 @@ and distributor interface, the ioctl must be called after calling
KVM_CREATE_IRQCHIP, but before calling KVM_RUN on any of the VCPUs. Calling
this ioctl twice for any of the base addresses will return -EEXIST.
+4.82 KVM_PPC_RTAS_DEFINE_TOKEN
+
+Capability: KVM_CAP_PPC_RTAS
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_rtas_token_args
+Returns: 0 on success, -1 on error
+
+Defines a token value for a RTAS (Run Time Abstraction Services)
+service in order to allow it to be handled in the kernel. The
+argument struct gives the name of the service, which must be the name
+of a service that has a kernel-side implementation. If the token
+value is non-zero, it will be associated with that service, and
+subsequent RTAS calls by the guest specifying that token will be
+handled by the kernel. If the token value is 0, then any token
+associated with the service will be forgotten, and subsequent RTAS
+calls by the guest for that service will be passed to userspace to be
+handled.
+
5. The kvm_run structure
------------------------
@@ -2658,3 +2764,11 @@ to receive the topmost interrupt vector.
When disabled (args[0] == 0), behavior is as if this facility is unsupported.
When this capability is enabled, KVM_EXIT_EPR can occur.
+
+6.6 KVM_CAP_IRQ_MPIC
+
+Architectures: ppc
+Parameters: args[0] is the MPIC device fd
+ args[1] is the MPIC CPU number for this vcpu
+
+This capability connects the vcpu to an in-kernel MPIC device.
diff --git a/Documentation/virtual/kvm/devices/README b/Documentation/virtual/kvm/devices/README
new file mode 100644
index 000000000000..34a69834124a
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/README
@@ -0,0 +1 @@
+This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL.
diff --git a/Documentation/virtual/kvm/devices/mpic.txt b/Documentation/virtual/kvm/devices/mpic.txt
new file mode 100644
index 000000000000..ad0ac778f20a
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/mpic.txt
@@ -0,0 +1,56 @@
+MPIC interrupt controller
+=========================
+
+Device types supported:
+ KVM_DEV_TYPE_FSL_MPIC_20 Freescale MPIC v2.0
+ KVM_DEV_TYPE_FSL_MPIC_42 Freescale MPIC v4.2
+
+Only one MPIC instance, of any type, may be instantiated. The created
+MPIC will act as the system interrupt controller, connecting to each
+vcpu's interrupt inputs.
+
+Groups:
+ KVM_DEV_MPIC_GRP_MISC
+ Attributes:
+ KVM_DEV_MPIC_BASE_ADDR (rw, 64-bit)
+ Base address of the 256 KiB MPIC register space. Must be
+ naturally aligned. A value of zero disables the mapping.
+ Reset value is zero.
+
+ KVM_DEV_MPIC_GRP_REGISTER (rw, 32-bit)
+ Access an MPIC register, as if the access were made from the guest.
+ "attr" is the byte offset into the MPIC register space. Accesses
+ must be 4-byte aligned.
+
+ MSIs may be signaled by using this attribute group to write
+ to the relevant MSIIR.
+
+ KVM_DEV_MPIC_GRP_IRQ_ACTIVE (rw, 32-bit)
+ IRQ input line for each standard openpic source. 0 is inactive and 1
+ is active, regardless of interrupt sense.
+
+ For edge-triggered interrupts: Writing 1 is considered an activating
+ edge, and writing 0 is ignored. Reading returns 1 if a previously
+ signaled edge has not been acknowledged, and 0 otherwise.
+
+ "attr" is the IRQ number. IRQ numbers for standard sources are the
+ byte offset of the relevant IVPR from EIVPR0, divided by 32.
+
+IRQ Routing:
+
+ The MPIC emulation supports IRQ routing. Only a single MPIC device can
+ be instantiated. Once that device has been created, it's available as
+ irqchip id 0.
+
+ This irqchip 0 has 256 interrupt pins, which expose the interrupts in
+ the main array of interrupt sources (a.k.a. "SRC" interrupts).
+
+ The numbering is the same as the MPIC device tree binding -- based on
+ the register offset from the beginning of the sources array, without
+ regard to any subdivisions in chip documentation such as "internal"
+ or "external" interrupts.
+
+ Default routes are established for these pins, with the GSI being equal
+ to the pin number.
+
+ Access to non-SRC interrupts is not implemented through IRQ routing mechanisms.
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index cfa74983c675..989dd3fe8de1 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -26,6 +26,7 @@
#define KVM_USER_MEM_SLOTS 32
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
/* define exit reasons from vmm to kvm*/
#define EXIT_REASON_VM_PANIC 0
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index 2cd225f8c68d..043183a06409 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -27,6 +27,7 @@ config KVM
select PREEMPT_NOTIFIERS
select ANON_INODES
select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_IRQ_ROUTING
select KVM_APIC_ARCHITECTURE
select KVM_MMIO
---help---
diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index db3d7c5d1071..511f64a78d56 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -49,7 +49,7 @@ ccflags-y := -Ivirt/kvm -Iarch/ia64/kvm/
asflags-y := -Ivirt/kvm -Iarch/ia64/kvm/
common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
- coalesced_mmio.o irq_comm.o assigned-dev.o)
+ coalesced_mmio.o irq_comm.o assigned-dev.o irqchip.o)
ifeq ($(CONFIG_IOMMU_API),y)
common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 4bc2c3dad6ad..cf4df8e2139a 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -270,6 +270,9 @@
#define H_SET_MODE 0x31C
#define MAX_HCALL_OPCODE H_SET_MODE
+/* Platform specific hcalls, used by KVM */
+#define H_RTAS 0xf000
+
#ifndef __ASSEMBLY__
/**
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index bc81842ea25a..349ed85c7d61 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -142,6 +142,8 @@ extern int kvmppc_mmu_hv_init(void);
extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+ unsigned int vec);
extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
bool upper, u32 val);
@@ -156,7 +158,8 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
unsigned long pte_index);
extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr,
unsigned long *nb_ret);
-extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr);
+extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr,
+ unsigned long gpa, bool dirty);
extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
long pte_index, unsigned long pteh, unsigned long ptel);
extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 38bec1dc9928..9c1ff330c805 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -268,4 +268,17 @@ static inline int is_vrma_hpte(unsigned long hpte_v)
(HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)));
}
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+/*
+ * Note modification of an HPTE; set the HPTE modified bit
+ * if anyone is interested.
+ */
+static inline void note_hpte_modification(struct kvm *kvm,
+ struct revmap_entry *rev)
+{
+ if (atomic_read(&kvm->arch.hpte_mod_interest))
+ rev->guest_rpte |= HPTE_GR_MODIFIED;
+}
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
#endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index cdc3d2717cc6..9039d3c97eec 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -20,6 +20,11 @@
#ifndef __ASM_KVM_BOOK3S_ASM_H__
#define __ASM_KVM_BOOK3S_ASM_H__
+/* XICS ICP register offsets */
+#define XICS_XIRR 4
+#define XICS_MFRR 0xc
+#define XICS_IPI 2 /* interrupt source # for IPIs */
+
#ifdef __ASSEMBLY__
#ifdef CONFIG_KVM_BOOK3S_HANDLER
@@ -81,10 +86,11 @@ struct kvmppc_host_state {
#ifdef CONFIG_KVM_BOOK3S_64_HV
u8 hwthread_req;
u8 hwthread_state;
-
+ u8 host_ipi;
struct kvm_vcpu *kvm_vcpu;
struct kvmppc_vcore *kvm_vcore;
unsigned long xics_phys;
+ u32 saved_xirr;
u64 dabr;
u64 host_mmcr[3];
u32 host_pmc[8];
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e34f8fee9080..af326cde7cb6 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -44,6 +44,10 @@
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
#endif
+/* These values are internal and can be increased later */
+#define KVM_NR_IRQCHIPS 1
+#define KVM_IRQCHIP_NUM_PINS 256
+
#if !defined(CONFIG_KVM_440)
#include <linux/mmu_notifier.h>
@@ -188,6 +192,10 @@ struct kvmppc_linear_info {
int type;
};
+/* XICS components, defined in book3s_xics.c */
+struct kvmppc_xics;
+struct kvmppc_icp;
+
/*
* The reverse mapping array has one entry for each HPTE,
* which stores the guest's view of the second word of the HPTE
@@ -255,6 +263,13 @@ struct kvm_arch {
#endif /* CONFIG_KVM_BOOK3S_64_HV */
#ifdef CONFIG_PPC_BOOK3S_64
struct list_head spapr_tce_tables;
+ struct list_head rtas_tokens;
+#endif
+#ifdef CONFIG_KVM_MPIC
+ struct openpic *mpic;
+#endif
+#ifdef CONFIG_KVM_XICS
+ struct kvmppc_xics *xics;
#endif
};
@@ -301,11 +316,13 @@ struct kvmppc_vcore {
* that a guest can register.
*/
struct kvmppc_vpa {
+ unsigned long gpa; /* Current guest phys addr */
void *pinned_addr; /* Address in kernel linear mapping */
void *pinned_end; /* End of region */
unsigned long next_gpa; /* Guest phys addr for update */
unsigned long len; /* Number of bytes required */
u8 update_pending; /* 1 => update pinned_addr from next_gpa */
+ bool dirty; /* true => area has been modified by kernel */
};
struct kvmppc_pte {
@@ -359,6 +376,11 @@ struct kvmppc_slb {
#define KVMPPC_BOOKE_MAX_IAC 4
#define KVMPPC_BOOKE_MAX_DAC 2
+/* KVMPPC_EPR_USER takes precedence over KVMPPC_EPR_KERNEL */
+#define KVMPPC_EPR_NONE 0 /* EPR not supported */
+#define KVMPPC_EPR_USER 1 /* exit to userspace to fill EPR */
+#define KVMPPC_EPR_KERNEL 2 /* in-kernel irqchip */
+
struct kvmppc_booke_debug_reg {
u32 dbcr0;
u32 dbcr1;
@@ -370,6 +392,12 @@ struct kvmppc_booke_debug_reg {
u64 dac[KVMPPC_BOOKE_MAX_DAC];
};
+#define KVMPPC_IRQ_DEFAULT 0
+#define KVMPPC_IRQ_MPIC 1
+#define KVMPPC_IRQ_XICS 2
+
+struct openpic;
+
struct kvm_vcpu_arch {
ulong host_stack;
u32 host_pid;
@@ -502,7 +530,9 @@ struct kvm_vcpu_arch {
spinlock_t wdt_lock;
struct timer_list wdt_timer;
u32 tlbcfg[4];
+ u32 tlbps[4];
u32 mmucfg;
+ u32 eptcfg;
u32 epr;
u32 crit_save;
struct kvmppc_booke_debug_reg dbg_reg;
@@ -522,7 +552,7 @@ struct kvm_vcpu_arch {
u8 sane;
u8 cpu_type;
u8 hcall_needed;
- u8 epr_enabled;
+ u8 epr_flags; /* KVMPPC_EPR_xxx */
u8 epr_needed;
u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
@@ -549,6 +579,13 @@ struct kvm_vcpu_arch {
unsigned long magic_page_pa; /* phys addr to map the magic page to */
unsigned long magic_page_ea; /* effect. addr to map the magic page to */
+ int irq_type; /* one of KVM_IRQ_* */
+ int irq_cpu_id;
+ struct openpic *mpic; /* KVM_IRQ_MPIC */
+#ifdef CONFIG_KVM_XICS
+ struct kvmppc_icp *icp; /* XICS presentation controller */
+#endif
+
#ifdef CONFIG_KVM_BOOK3S_64_HV
struct kvm_vcpu_arch_shared shregs;
@@ -589,5 +626,6 @@ struct kvm_vcpu_arch {
#define KVM_MMIO_REG_FQPR 0x0060
#define __KVM_HAVE_ARCH_WQP
+#define __KVM_HAVE_CREATE_DEVICE
#endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index f58930779ae8..d7339df19259 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -44,7 +44,7 @@ enum emulation_result {
EMULATE_DO_DCR, /* kvm_run filled with DCR request */
EMULATE_FAIL, /* can't emulate this instruction */
EMULATE_AGAIN, /* something went wrong. go again */
- EMULATE_DO_PAPR, /* kvm_run filled with PAPR request */
+ EMULATE_EXIT_USER, /* emulation requires exit to user-space */
};
extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
@@ -130,6 +130,7 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
struct kvm_memory_slot *memslot, unsigned long porder);
extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+
extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce *args);
extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
@@ -164,6 +165,18 @@ extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
+int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
+
+extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
+extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
+extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
+extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server,
+ u32 priority);
+extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
+ u32 *priority);
+extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq);
+extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq);
+
/*
* Cuts out inst bits with ordering according to spec.
* That means the leftmost bit is zero. All given bits are included.
@@ -245,12 +258,29 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
+struct openpic;
+
#ifdef CONFIG_KVM_BOOK3S_64_HV
static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
{
paca[cpu].kvm_hstate.xics_phys = addr;
}
+static inline u32 kvmppc_get_xics_latch(void)
+{
+ u32 xirr = get_paca()->kvm_hstate.saved_xirr;
+
+ get_paca()->kvm_hstate.saved_xirr = 0;
+
+ return xirr;
+}
+
+static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
+{
+ paca[cpu].kvm_hstate.host_ipi = host_ipi;
+}
+
+extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu);
extern void kvm_linear_init(void);
#else
@@ -259,6 +289,44 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
static inline void kvm_linear_init(void)
{}
+
+static inline u32 kvmppc_get_xics_latch(void)
+{
+ return 0;
+}
+
+static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
+{}
+
+static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+ kvm_vcpu_kick(vcpu);
+}
+#endif
+
+#ifdef CONFIG_KVM_XICS
+static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
+}
+extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
+extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
+extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
+extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
+#else
+static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
+ { return 0; }
+static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
+static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu,
+ unsigned long server)
+ { return -EINVAL; }
+static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm,
+ struct kvm_irq_level *args)
+ { return -ENOTTY; }
+static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+ { return 0; }
#endif
static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
@@ -270,6 +338,32 @@ static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
#endif
}
+#ifdef CONFIG_KVM_MPIC
+
+void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu);
+int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
+ u32 cpu);
+void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu);
+
+#else
+
+static inline void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline int kvmppc_mpic_connect_vcpu(struct kvm_device *dev,
+ struct kvm_vcpu *vcpu, u32 cpu)
+{
+ return -EINVAL;
+}
+
+static inline void kvmppc_mpic_disconnect_vcpu(struct openpic *opp,
+ struct kvm_vcpu *vcpu)
+{
+}
+
+#endif /* CONFIG_KVM_MPIC */
+
int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
struct kvm_config_tlb *cfg);
int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
@@ -282,8 +376,15 @@ void kvmppc_init_lpid(unsigned long nr_lpids);
static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
{
- /* Clear i-cache for new pages */
struct page *page;
+ /*
+ * We can only access pages that the kernel maps
+ * as memory. Bail out for unmapped ones.
+ */
+ if (!pfn_valid(pfn))
+ return;
+
+ /* Clear i-cache for new pages */
page = pfn_to_page(pfn);
if (!test_bit(PG_arch_1, &page->flags)) {
flush_dcache_icache_page(page);
@@ -323,4 +424,6 @@ static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb)
return ea;
}
+extern void xics_wake_cpu(int cpu);
+
#endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index c9c67fc888c9..799322433620 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -290,6 +290,7 @@
#define LPCR_PECE1 0x00002000 /* decrementer can cause exit */
#define LPCR_PECE2 0x00001000 /* machine check etc can cause exit */
#define LPCR_MER 0x00000800 /* Mediated External Exception */
+#define LPCR_MER_SH 11
#define LPCR_LPES 0x0000000c
#define LPCR_LPES0 0x00000008 /* LPAR Env selector 0 */
#define LPCR_LPES1 0x00000004 /* LPAR Env selector 1 */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index c2ff99c01562..427b9aca2a0f 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -25,6 +25,8 @@
/* Select powerpc specific features in <linux/kvm.h> */
#define __KVM_HAVE_SPAPR_TCE
#define __KVM_HAVE_PPC_SMT
+#define __KVM_HAVE_IRQCHIP
+#define __KVM_HAVE_IRQ_LINE
struct kvm_regs {
__u64 pc;
@@ -272,8 +274,31 @@ struct kvm_debug_exit_arch {
/* for KVM_SET_GUEST_DEBUG */
struct kvm_guest_debug_arch {
+ struct {
+ /* H/W breakpoint/watchpoint address */
+ __u64 addr;
+ /*
+ * Type denotes h/w breakpoint, read watchpoint, write
+ * watchpoint or watchpoint (both read and write).
+ */
+#define KVMPPC_DEBUG_NONE 0x0
+#define KVMPPC_DEBUG_BREAKPOINT (1UL << 1)
+#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2)
+#define KVMPPC_DEBUG_WATCH_READ (1UL << 3)
+ __u32 type;
+ __u32 reserved;
+ } bp[16];
};
+/* Debug related defines */
+/*
+ * kvm_guest_debug->control is a 32 bit field. The lower 16 bits are generic
+ * and upper 16 bits are architecture specific. Architecture specific defines
+ * that ioctl is for setting hardware breakpoint or software breakpoint.
+ */
+#define KVM_GUESTDBG_USE_SW_BP 0x00010000
+#define KVM_GUESTDBG_USE_HW_BP 0x00020000
+
/* definition of registers in kvm_run */
struct kvm_sync_regs {
};
@@ -299,6 +324,12 @@ struct kvm_allocate_rma {
__u64 rma_size;
};
+/* for KVM_CAP_PPC_RTAS */
+struct kvm_rtas_token_args {
+ char name[120];
+ __u64 token; /* Use a token of 0 to undefine a mapping */
+};
+
struct kvm_book3e_206_tlb_entry {
__u32 mas8;
__u32 mas1;
@@ -359,6 +390,26 @@ struct kvm_get_htab_header {
__u16 n_invalid;
};
+/* Per-vcpu XICS interrupt controller state */
+#define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)
+
+#define KVM_REG_PPC_ICP_CPPR_SHIFT 56 /* current proc priority */
+#define KVM_REG_PPC_ICP_CPPR_MASK 0xff
+#define KVM_REG_PPC_ICP_XISR_SHIFT 32 /* interrupt status field */
+#define KVM_REG_PPC_ICP_XISR_MASK 0xffffff
+#define KVM_REG_PPC_ICP_MFRR_SHIFT 24 /* pending IPI priority */
+#define KVM_REG_PPC_ICP_MFRR_MASK 0xff
+#define KVM_REG_PPC_ICP_PPRI_SHIFT 16 /* pending irq priority */
+#define KVM_REG_PPC_ICP_PPRI_MASK 0xff
+
+/* Device control API: PPC-specific devices */
+#define KVM_DEV_MPIC_GRP_MISC 1
+#define KVM_DEV_MPIC_BASE_ADDR 0 /* 64-bit */
+
+#define KVM_DEV_MPIC_GRP_REGISTER 2 /* 32-bit */
+#define KVM_DEV_MPIC_GRP_IRQ_ACTIVE 3 /* 32-bit */
+
+/* One-Reg API: PPC-specific registers */
#define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1)
#define KVM_REG_PPC_IAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2)
#define KVM_REG_PPC_IAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3)
@@ -426,4 +477,26 @@ struct kvm_get_htab_header {
/* Debugging: Special instruction for software breakpoint */
#define KVM_REG_PPC_DEBUG_INST (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8b)
+/* MMU registers */
+#define KVM_REG_PPC_MAS0 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8c)
+#define KVM_REG_PPC_MAS1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8d)
+#define KVM_REG_PPC_MAS2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8e)
+#define KVM_REG_PPC_MAS7_3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8f)
+#define KVM_REG_PPC_MAS4 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x90)
+#define KVM_REG_PPC_MAS6 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x91)
+#define KVM_REG_PPC_MMUCFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x92)
+/*
+ * TLBnCFG fields TLBnCFG_N_ENTRY and TLBnCFG_ASSOC can be changed only using
+ * KVM_CAP_SW_TLB ioctl
+ */
+#define KVM_REG_PPC_TLB0CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x93)
+#define KVM_REG_PPC_TLB1CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x94)
+#define KVM_REG_PPC_TLB2CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x95)
+#define KVM_REG_PPC_TLB3CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x96)
+#define KVM_REG_PPC_TLB0PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x97)
+#define KVM_REG_PPC_TLB1PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x98)
+#define KVM_REG_PPC_TLB2PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x99)
+#define KVM_REG_PPC_TLB3PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9a)
+#define KVM_REG_PPC_EPTCFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9b)
+
#endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index d87c90886c75..a791229329cf 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -477,6 +477,7 @@ int main(void)
DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
+ DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
#endif
#ifdef CONFIG_PPC_BOOK3S
DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
@@ -573,6 +574,8 @@ int main(void)
HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
+ HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
+ HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
HSTATE_FIELD(HSTATE_PMC, host_pmc);
HSTATE_FIELD(HSTATE_PURR, host_purr);
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 3d7fd21c65f9..2f5c6b6d6877 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -124,6 +124,18 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
return kvmppc_set_sregs_ivor(vcpu, sregs);
}
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
+ union kvmppc_one_reg *val)
+{
+ return -EINVAL;
+}
+
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
+ union kvmppc_one_reg *val)
+{
+ return -EINVAL;
+}
+
struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
{
struct kvmppc_vcpu_44x *vcpu_44x;
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 63c67ec72e43..eb643f862579 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -136,21 +136,41 @@ config KVM_E500V2
If unsure, say N.
config KVM_E500MC
- bool "KVM support for PowerPC E500MC/E5500 processors"
+ bool "KVM support for PowerPC E500MC/E5500/E6500 processors"
depends on PPC_E500MC
select KVM
select KVM_MMIO
select KVM_BOOKE_HV
select MMU_NOTIFIER
---help---
- Support running unmodified E500MC/E5500 (32-bit) guest kernels in
- virtual machines on E500MC/E5500 host processors.
+ Support running unmodified E500MC/E5500/E6500 guest kernels in
+ virtual machines on E500MC/E5500/E6500 host processors.
This module provides access to the hardware capabilities through
a character device node named /dev/kvm.
If unsure, say N.
+config KVM_MPIC
+ bool "KVM in-kernel MPIC emulation"
+ depends on KVM && E500
+ select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_IRQ_ROUTING
+ select HAVE_KVM_MSI
+ help
+ Enable support for emulating MPIC devices inside the
+ host kernel, rather than relying on userspace to emulate.
+ Currently, support is limited to certain versions of
+ Freescale's MPIC implementation.
+
+config KVM_XICS
+ bool "KVM in-kernel XICS emulation"
+ depends on KVM_BOOK3S_64 && !KVM_MPIC
+ ---help---
+ Include support for the XICS (eXternal Interrupt Controller
+ Specification) interrupt controller architecture used on
+ IBM POWER (pSeries) servers.
+
source drivers/vhost/Kconfig
endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index b772eded8c26..422de3f4d46c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -72,12 +72,18 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
book3s_hv.o \
book3s_hv_interrupts.o \
book3s_64_mmu_hv.o
+kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
+ book3s_hv_rm_xics.o
kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
book3s_hv_rmhandlers.o \
book3s_hv_rm_mmu.o \
book3s_64_vio_hv.o \
book3s_hv_ras.o \
- book3s_hv_builtin.o
+ book3s_hv_builtin.o \
+ $(kvm-book3s_64-builtin-xics-objs-y)
+
+kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
+ book3s_xics.o
kvm-book3s_64-module-objs := \
../../../virt/kvm/kvm_main.o \
@@ -86,6 +92,7 @@ kvm-book3s_64-module-objs := \
emulate.o \
book3s.o \
book3s_64_vio.o \
+ book3s_rtas.o \
$(kvm-book3s_64-objs-y)
kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
@@ -103,6 +110,9 @@ kvm-book3s_32-objs := \
book3s_32_mmu.o
kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
+kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o
+kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(addprefix ../../../virt/kvm/, irqchip.o)
+
kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
obj-$(CONFIG_KVM_440) += kvm.o
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 2d32ae4bc439..700df6f1d32c 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -104,7 +104,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
return prio;
}
-static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
unsigned int vec)
{
unsigned long old_pending = vcpu->arch.pending_exceptions;
@@ -535,6 +535,15 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
&opcode, sizeof(u32));
break;
}
+#ifdef CONFIG_KVM_XICS
+ case KVM_REG_PPC_ICP_STATE:
+ if (!vcpu->arch.icp) {
+ r = -ENXIO;
+ break;
+ }
+ val = get_reg_val(reg->id, kvmppc_xics_get_icp(vcpu));
+ break;
+#endif /* CONFIG_KVM_XICS */
default:
r = -EINVAL;
break;
@@ -597,6 +606,16 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val);
break;
#endif /* CONFIG_ALTIVEC */
+#ifdef CONFIG_KVM_XICS
+ case KVM_REG_PPC_ICP_STATE:
+ if (!vcpu->arch.icp) {
+ r = -ENXIO;
+ break;
+ }
+ r = kvmppc_xics_set_icp(vcpu,
+ set_reg_val(reg->id, val));
+ break;
+#endif /* CONFIG_KVM_XICS */
default:
r = -EINVAL;
break;
@@ -612,6 +631,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
return 0;
}
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
+{
+ return -EINVAL;
+}
+
void kvmppc_decrementer_func(unsigned long data)
{
struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 8cc18abd6dde..69efe0d6cedc 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -893,7 +893,10 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
/* Harvest R and C */
rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
- rev[i].guest_rpte = ptel | rcbits;
+ if (rcbits & ~rev[i].guest_rpte) {
+ rev[i].guest_rpte = ptel | rcbits;
+ note_hpte_modification(kvm, &rev[i]);
+ }
}
unlock_rmap(rmapp);
hptep[0] &= ~HPTE_V_HVLOCK;
@@ -976,7 +979,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
/* Now check and modify the HPTE */
if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) {
kvmppc_clear_ref_hpte(kvm, hptep, i);
- rev[i].guest_rpte |= HPTE_R_R;
+ if (!(rev[i].guest_rpte & HPTE_R_R)) {
+ rev[i].guest_rpte |= HPTE_R_R;
+ note_hpte_modification(kvm, &rev[i]);
+ }
ret = 1;
}
hptep[0] &= ~HPTE_V_HVLOCK;
@@ -1080,7 +1086,10 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
hptep[1] &= ~HPTE_R_C;
eieio();
hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
- rev[i].guest_rpte |= HPTE_R_C;
+ if (!(rev[i].guest_rpte & HPTE_R_C)) {
+ rev[i].guest_rpte |= HPTE_R_C;
+ note_hpte_modification(kvm, &rev[i]);
+ }
ret = 1;
}
hptep[0] &= ~HPTE_V_HVLOCK;
@@ -1090,11 +1099,30 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
return ret;
}
+static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
+ struct kvm_memory_slot *memslot,
+ unsigned long *map)
+{
+ unsigned long gfn;
+
+ if (!vpa->dirty || !vpa->pinned_addr)
+ return;
+ gfn = vpa->gpa >> PAGE_SHIFT;
+ if (gfn < memslot->base_gfn ||
+ gfn >= memslot->base_gfn + memslot->npages)
+ return;
+
+ vpa->dirty = false;
+ if (map)
+ __set_bit_le(gfn - memslot->base_gfn, map);
+}
+
long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long *map)
{
unsigned long i;
unsigned long *rmapp;
+ struct kvm_vcpu *vcpu;
preempt_disable();
rmapp = memslot->arch.rmap;
@@ -1103,6 +1131,15 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
__set_bit_le(i, map);
++rmapp;
}
+
+ /* Harvest dirty bits from VPA and DTL updates */
+ /* Note: we never modify the SLB shadow buffer areas */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ spin_lock(&vcpu->arch.vpa_update_lock);
+ harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map);
+ harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map);
+ spin_unlock(&vcpu->arch.vpa_update_lock);
+ }
preempt_enable();
return 0;
}
@@ -1114,7 +1151,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
unsigned long gfn = gpa >> PAGE_SHIFT;
struct page *page, *pages[1];
int npages;
- unsigned long hva, psize, offset;
+ unsigned long hva, offset;
unsigned long pa;
unsigned long *physp;
int srcu_idx;
@@ -1146,14 +1183,9 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
}
srcu_read_unlock(&kvm->srcu, srcu_idx);
- psize = PAGE_SIZE;
- if (PageHuge(page)) {
- page = compound_head(page);
- psize <<= compound_order(page);
- }
- offset = gpa & (psize - 1);
+ offset = gpa & (PAGE_SIZE - 1);
if (nb_ret)
- *nb_ret = psize - offset;
+ *nb_ret = PAGE_SIZE - offset;
return page_address(page) + offset;
err:
@@ -1161,11 +1193,31 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
return NULL;
}
-void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
+void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
+ bool dirty)
{
struct page *page = virt_to_page(va);
+ struct kvm_memory_slot *memslot;
+ unsigned long gfn;
+ unsigned long *rmap;
+ int srcu_idx;
put_page(page);
+
+ if (!dirty || !kvm->arch.using_mmu_notifiers)
+ return;
+
+ /* We need to mark this page dirty in the rmap chain */
+ gfn = gpa >> PAGE_SHIFT;
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+ memslot = gfn_to_memslot(kvm, gfn);
+ if (memslot) {
+ rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+ lock_rmap(rmap);
+ *rmap |= KVMPPC_RMAP_CHANGED;
+ unlock_rmap(rmap);
+ }
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
}
/*
@@ -1193,16 +1245,36 @@ struct kvm_htab_ctx {
#define HPTE_SIZE (2 * sizeof(unsigned long))
+/*
+ * Returns 1 if this HPT entry has been modified or has pending
+ * R/C bit changes.
+ */
+static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp)
+{
+ unsigned long rcbits_unset;
+
+ if (revp->guest_rpte & HPTE_GR_MODIFIED)
+ return 1;
+
+ /* Also need to consider changes in reference and changed bits */
+ rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
+ if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset))
+ return 1;
+
+ return 0;
+}
+
static long record_hpte(unsigned long flags, unsigned long *hptp,
unsigned long *hpte, struct revmap_entry *revp,
int want_valid, int first_pass)
{
unsigned long v, r;
+ unsigned long rcbits_unset;
int ok = 1;
int valid, dirty;
/* Unmodified entries are uninteresting except on the first pass */
- dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+ dirty = hpte_dirty(revp, hptp);
if (!first_pass && !dirty)
return 0;
@@ -1223,16 +1295,28 @@ static long record_hpte(unsigned long flags, unsigned long *hptp,
while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
cpu_relax();
v = hptp[0];
+
+ /* re-evaluate valid and dirty from synchronized HPTE value */
+ valid = !!(v & HPTE_V_VALID);
+ dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+
+ /* Harvest R and C into guest view if necessary */
+ rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
+ if (valid && (rcbits_unset & hptp[1])) {
+ revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) |
+ HPTE_GR_MODIFIED;
+ dirty = 1;
+ }
+
if (v & HPTE_V_ABSENT) {
v &= ~HPTE_V_ABSENT;
v |= HPTE_V_VALID;
+ valid = 1;
}
- /* re-evaluate valid and dirty from synchronized HPTE value */
- valid = !!(v & HPTE_V_VALID);
if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
valid = 0;
- r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C));
- dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+
+ r = revp->guest_rpte;
/* only clear modified if this is the right sort of entry */
if (valid == want_valid && dirty) {
r &= ~HPTE_GR_MODIFIED;
@@ -1288,7 +1372,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
/* Skip uninteresting entries, i.e. clean on not-first pass */
if (!first_pass) {
while (i < kvm->arch.hpt_npte &&
- !(revp->guest_rpte & HPTE_GR_MODIFIED)) {
+ !hpte_dirty(revp, hptp)) {
++i;
hptp += 2;
++revp;
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 836c56975e21..1f6344c4408d 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -194,7 +194,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
run->papr_hcall.args[i] = gpr;
}
- emulated = EMULATE_DO_PAPR;
+ run->exit_reason = KVM_EXIT_PAPR_HCALL;
+ vcpu->arch.hcall_needed = 1;
+ emulated = EMULATE_EXIT_USER;
break;
}
#endif
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1e521baf9a7d..178521e81ce4 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -66,6 +66,31 @@
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
+void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+ int me;
+ int cpu = vcpu->cpu;
+ wait_queue_head_t *wqp;
+
+ wqp = kvm_arch_vcpu_wq(vcpu);
+ if (waitqueue_active(wqp)) {
+ wake_up_interruptible(wqp);
+ ++vcpu->stat.halt_wakeup;
+ }
+
+ me = get_cpu();
+
+ /* CPU points to the first thread of the core */
+ if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
+ int real_cpu = cpu + vcpu->arch.ptid;
+ if (paca[real_cpu].kvm_hstate.xics_phys)
+ xics_wake_cpu(real_cpu);
+ else if (cpu_online(cpu))
+ smp_send_reschedule(cpu);
+ }
+ put_cpu();
+}
+
/*
* We use the vcpu_load/put functions to measure stolen time.
* Stolen time is counted as time when either the vcpu is able to
@@ -259,7 +284,7 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
len = ((struct reg_vpa *)va)->length.hword;
else
len = ((struct reg_vpa *)va)->length.word;
- kvmppc_unpin_guest_page(kvm, va);
+ kvmppc_unpin_guest_page(kvm, va, vpa, false);
/* Check length */
if (len > nb || len < sizeof(struct reg_vpa))
@@ -359,13 +384,13 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
va = NULL;
nb = 0;
if (gpa)
- va = kvmppc_pin_guest_page(kvm, vpap->next_gpa, &nb);
+ va = kvmppc_pin_guest_page(kvm, gpa, &nb);
spin_lock(&vcpu->arch.vpa_update_lock);
if (gpa == vpap->next_gpa)
break;
/* sigh... unpin that one and try again */
if (va)
- kvmppc_unpin_guest_page(kvm, va);
+ kvmppc_unpin_guest_page(kvm, va, gpa, false);
}
vpap->update_pending = 0;
@@ -375,12 +400,15 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
* has changed the mappings underlying guest memory,
* so unregister the region.
*/
- kvmppc_unpin_guest_page(kvm, va);
+ kvmppc_unpin_guest_page(kvm, va, gpa, false);
va = NULL;
}
if (vpap->pinned_addr)
- kvmppc_unpin_guest_page(kvm, vpap->pinned_addr);
+ kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
+ vpap->dirty);
+ vpap->gpa = gpa;
vpap->pinned_addr = va;
+ vpap->dirty = false;
if (va)
vpap->pinned_end = va + vpap->len;
}
@@ -472,6 +500,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
/* order writing *dt vs. writing vpa->dtl_idx */
smp_wmb();
vpa->dtl_idx = ++vcpu->arch.dtl_index;
+ vcpu->arch.dtl.dirty = true;
}
int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
@@ -479,7 +508,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
unsigned long req = kvmppc_get_gpr(vcpu, 3);
unsigned long target, ret = H_SUCCESS;
struct kvm_vcpu *tvcpu;
- int idx;
+ int idx, rc;
switch (req) {
case H_ENTER:
@@ -515,6 +544,28 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
kvmppc_get_gpr(vcpu, 5),
kvmppc_get_gpr(vcpu, 6));
break;
+ case H_RTAS:
+ if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+ return RESUME_HOST;
+
+ rc = kvmppc_rtas_hcall(vcpu);
+
+ if (rc == -ENOENT)
+ return RESUME_HOST;
+ else if (rc == 0)
+ break;
+
+ /* Send the error out to userspace via KVM_RUN */
+ return rc;
+
+ case H_XIRR:
+ case H_CPPR:
+ case H_EOI:
+ case H_IPI:
+ if (kvmppc_xics_enabled(vcpu)) {
+ ret = kvmppc_xics_hcall(vcpu, req);
+ break;
+ } /* fallthrough */
default:
return RESUME_HOST;
}
@@ -913,15 +964,19 @@ out:
return ERR_PTR(err);
}
+static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
+{
+ if (vpa->pinned_addr)
+ kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
+ vpa->dirty);
+}
+
void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
{
spin_lock(&vcpu->arch.vpa_update_lock);
- if (vcpu->arch.dtl.pinned_addr)
- kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl.pinned_addr);
- if (vcpu->arch.slb_shadow.pinned_addr)
- kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow.pinned_addr);
- if (vcpu->arch.vpa.pinned_addr)
- kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa.pinned_addr);
+ unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
+ unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
+ unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
spin_unlock(&vcpu->arch.vpa_update_lock);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, vcpu);
@@ -955,7 +1010,6 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
}
extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
-extern void xics_wake_cpu(int cpu);
static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
struct kvm_vcpu *vcpu)
@@ -1330,9 +1384,12 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
break;
vc->runner = vcpu;
n_ceded = 0;
- list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
+ list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
if (!v->arch.pending_exceptions)
n_ceded += v->arch.ceded;
+ else
+ v->arch.ceded = 0;
+ }
if (n_ceded == vc->n_runnable)
kvmppc_vcore_blocked(vc);
else
@@ -1821,6 +1878,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
cpumask_setall(&kvm->arch.need_tlb_flush);
INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+ INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
kvm->arch.rma = NULL;
@@ -1866,6 +1924,8 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
kvm->arch.rma = NULL;
}
+ kvmppc_rtas_tokens_free(kvm);
+
kvmppc_free_hpt(kvm);
WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 19c93bae1aea..6dcbb49105a4 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -97,17 +97,6 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
}
EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
-/*
- * Note modification of an HPTE; set the HPTE modified bit
- * if anyone is interested.
- */
-static inline void note_hpte_modification(struct kvm *kvm,
- struct revmap_entry *rev)
-{
- if (atomic_read(&kvm->arch.hpte_mod_interest))
- rev->guest_rpte |= HPTE_GR_MODIFIED;
-}
-
/* Remove this HPTE from the chain for a real page */
static void remove_revmap_chain(struct kvm *kvm, long pte_index,
struct revmap_entry *rev,
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
new file mode 100644
index 000000000000..b4b0082f761c
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -0,0 +1,406 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+
+#include "book3s_xics.h"
+
+#define DEBUG_PASSUP
+
+static inline void rm_writeb(unsigned long paddr, u8 val)
+{
+ __asm__ __volatile__("sync; stbcix %0,0,%1"
+ : : "r" (val), "r" (paddr) : "memory");
+}
+
+static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu *this_vcpu)
+{
+ struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
+ unsigned long xics_phys;
+ int cpu;
+
+ /* Mark the target VCPU as having an interrupt pending */
+ vcpu->stat.queue_intr++;
+ set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
+
+ /* Kick self ? Just set MER and return */
+ if (vcpu == this_vcpu) {
+ mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_MER);
+ return;
+ }
+
+ /* Check if the core is loaded, if not, too hard */
+ cpu = vcpu->cpu;
+ if (cpu < 0 || cpu >= nr_cpu_ids) {
+ this_icp->rm_action |= XICS_RM_KICK_VCPU;
+ this_icp->rm_kick_target = vcpu;
+ return;
+ }
+ /* In SMT cpu will always point to thread 0, we adjust it */
+ cpu += vcpu->arch.ptid;
+
+ /* Not too hard, then poke the target */
+ xics_phys = paca[cpu].kvm_hstate.xics_phys;
+ rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+}
+
+static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
+{
+ /* Note: Only called on self ! */
+ clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
+ &vcpu->arch.pending_exceptions);
+ mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
+}
+
+static inline bool icp_rm_try_update(struct kvmppc_icp *icp,
+ union kvmppc_icp_state old,
+ union kvmppc_icp_state new)
+{
+ struct kvm_vcpu *this_vcpu = local_paca->kvm_hstate.kvm_vcpu;
+ bool success;
+
+ /* Calculate new output value */
+ new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+ /* Attempt atomic update */
+ success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+ if (!success)
+ goto bail;
+
+ /*
+ * Check for output state update
+ *
+ * Note that this is racy since another processor could be updating
+ * the state already. This is why we never clear the interrupt output
+ * here, we only ever set it. The clear only happens prior to doing
+ * an update and only by the processor itself. Currently we do it
+ * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
+ *
+ * We also do not try to figure out whether the EE state has changed,
+ * we unconditionally set it if the new state calls for it. The reason
+ * for that is that we opportunistically remove the pending interrupt
+ * flag when raising CPPR, so we need to set it back here if an
+ * interrupt is still pending.
+ */
+ if (new.out_ee)
+ icp_rm_set_vcpu_irq(icp->vcpu, this_vcpu);
+
+ /* Expose the state change for debug purposes */
+ this_vcpu->arch.icp->rm_dbgstate = new;
+ this_vcpu->arch.icp->rm_dbgtgt = icp->vcpu;
+
+ bail:
+ return success;
+}
+
+static inline int check_too_hard(struct kvmppc_xics *xics,
+ struct kvmppc_icp *icp)
+{
+ return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS;
+}
+
+static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+ u8 new_cppr)
+{
+ union kvmppc_icp_state old_state, new_state;
+ bool resend;
+
+ /*
+ * This handles several related states in one operation:
+ *
+ * ICP State: Down_CPPR
+ *
+ * Load CPPR with new value and if the XISR is 0
+ * then check for resends:
+ *
+ * ICP State: Resend
+ *
+ * If MFRR is more favored than CPPR, check for IPIs
+ * and notify ICS of a potential resend. This is done
+ * asynchronously (when used in real mode, we will have
+ * to exit here).
+ *
+ * We do not handle the complete Check_IPI as documented
+ * here. In the PAPR, this state will be used for both
+ * Set_MFRR and Down_CPPR. However, we know that we aren't
+ * changing the MFRR state here so we don't need to handle
+ * the case of an MFRR causing a reject of a pending irq,
+ * this will have been handled when the MFRR was set in the
+ * first place.
+ *
+ * Thus we don't have to handle rejects, only resends.
+ *
+ * When implementing real mode for HV KVM, resend will lead to
+ * a H_TOO_HARD return and the whole transaction will be handled
+ * in virtual mode.
+ */
+ do {
+ old_state = new_state = ACCESS_ONCE(icp->state);
+
+ /* Down_CPPR */
+ new_state.cppr = new_cppr;
+
+ /*
+ * Cut down Resend / Check_IPI / IPI
+ *
+ * The logic is that we cannot have a pending interrupt
+ * trumped by an IPI at this point (see above), so we
+ * know that either the pending interrupt is already an
+ * IPI (in which case we don't care to override it) or
+ * it's either more favored than us or non existent
+ */
+ if (new_state.mfrr < new_cppr &&
+ new_state.mfrr <= new_state.pending_pri) {
+ new_state.pending_pri = new_state.mfrr;
+ new_state.xisr = XICS_IPI;
+ }
+
+ /* Latch/clear resend bit */
+ resend = new_state.need_resend;
+ new_state.need_resend = 0;
+
+ } while (!icp_rm_try_update(icp, old_state, new_state));
+
+ /*
+ * Now handle resend checks. Those are asynchronous to the ICP
+ * state update in HW (ie bus transactions) so we can handle them
+ * separately here as well.
+ */
+ if (resend)
+ icp->rm_action |= XICS_RM_CHECK_RESEND;
+}
+
+
+unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
+{
+ union kvmppc_icp_state old_state, new_state;
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ u32 xirr;
+
+ if (!xics || !xics->real_mode)
+ return H_TOO_HARD;
+
+ /* First clear the interrupt */
+ icp_rm_clr_vcpu_irq(icp->vcpu);
+
+ /*
+ * ICP State: Accept_Interrupt
+ *
+ * Return the pending interrupt (if any) along with the
+ * current CPPR, then clear the XISR & set CPPR to the
+ * pending priority
+ */
+ do {
+ old_state = new_state = ACCESS_ONCE(icp->state);
+
+ xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+ if (!old_state.xisr)
+ break;
+ new_state.cppr = new_state.pending_pri;
+ new_state.pending_pri = 0xff;
+ new_state.xisr = 0;
+
+ } while (!icp_rm_try_update(icp, old_state, new_state));
+
+ /* Return the result in GPR4 */
+ vcpu->arch.gpr[4] = xirr;
+
+ return check_too_hard(xics, icp);
+}
+
+int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+ unsigned long mfrr)
+{
+ union kvmppc_icp_state old_state, new_state;
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ struct kvmppc_icp *icp, *this_icp = vcpu->arch.icp;
+ u32 reject;
+ bool resend;
+ bool local;
+
+ if (!xics || !xics->real_mode)
+ return H_TOO_HARD;
+
+ local = this_icp->server_num == server;
+ if (local)
+ icp = this_icp;
+ else
+ icp = kvmppc_xics_find_server(vcpu->kvm, server);
+ if (!icp)
+ return H_PARAMETER;
+
+ /*
+ * ICP state: Set_MFRR
+ *
+ * If the CPPR is more favored than the new MFRR, then
+ * nothing needs to be done as there can be no XISR to
+ * reject.
+ *
+ * If the CPPR is less favored, then we might be replacing
+ * an interrupt, and thus need to possibly reject it as in
+ *
+ * ICP state: Check_IPI
+ */
+ do {
+ old_state = new_state = ACCESS_ONCE(icp->state);
+
+ /* Set_MFRR */
+ new_state.mfrr = mfrr;
+
+ /* Check_IPI */
+ reject = 0;
+ resend = false;
+ if (mfrr < new_state.cppr) {
+ /* Reject a pending interrupt if not an IPI */
+ if (mfrr <= new_state.pending_pri)
+ reject = new_state.xisr;
+ new_state.pending_pri = mfrr;
+ new_state.xisr = XICS_IPI;
+ }
+
+ if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
+ resend = new_state.need_resend;
+ new_state.need_resend = 0;
+ }
+ } while (!icp_rm_try_update(icp, old_state, new_state));
+
+ /* Pass rejects to virtual mode */
+ if (reject && reject != XICS_IPI) {
+ this_icp->rm_action |= XICS_RM_REJECT;
+ this_icp->rm_reject = reject;
+ }
+
+ /* Pass resends to virtual mode */
+ if (resend)
+ this_icp->rm_action |= XICS_RM_CHECK_RESEND;
+
+ return check_too_hard(xics, this_icp);
+}
+
+int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+ union kvmppc_icp_state old_state, new_state;
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ u32 reject;
+
+ if (!xics || !xics->real_mode)
+ return H_TOO_HARD;
+
+ /*
+ * ICP State: Set_CPPR
+ *
+ * We can safely compare the new value with the current
+ * value outside of the transaction as the CPPR is only
+ * ever changed by the processor on itself
+ */
+ if (cppr > icp->state.cppr) {
+ icp_rm_down_cppr(xics, icp, cppr);
+ goto bail;
+ } else if (cppr == icp->state.cppr)
+ return H_SUCCESS;
+
+ /*
+ * ICP State: Up_CPPR
+ *
+ * The processor is raising its priority, this can result
+ * in a rejection of a pending interrupt:
+ *
+ * ICP State: Reject_Current
+ *
+ * We can remove EE from the current processor, the update
+ * transaction will set it again if needed
+ */
+ icp_rm_clr_vcpu_irq(icp->vcpu);
+
+ do {
+ old_state = new_state = ACCESS_ONCE(icp->state);
+
+ reject = 0;
+ new_state.cppr = cppr;
+
+ if (cppr <= new_state.pending_pri) {
+ reject = new_state.xisr;
+ new_state.xisr = 0;
+ new_state.pending_pri = 0xff;
+ }
+
+ } while (!icp_rm_try_update(icp, old_state, new_state));
+
+ /* Pass rejects to virtual mode */
+ if (reject && reject != XICS_IPI) {
+ icp->rm_action |= XICS_RM_REJECT;
+ icp->rm_reject = reject;
+ }
+ bail:
+ return check_too_hard(xics, icp);
+}
+
+int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ struct kvmppc_ics *ics;
+ struct ics_irq_state *state;
+ u32 irq = xirr & 0x00ffffff;
+ u16 src;
+
+ if (!xics || !xics->real_mode)
+ return H_TOO_HARD;
+
+ /*
+ * ICP State: EOI
+ *
+ * Note: If EOI is incorrectly used by SW to lower the CPPR
+ * value (ie more favored), we do not check for rejection of
+ * a pending interrupt, this is a SW error and PAPR sepcifies
+ * that we don't have to deal with it.
+ *
+ * The sending of an EOI to the ICS is handled after the
+ * CPPR update
+ *
+ * ICP State: Down_CPPR which we handle
+ * in a separate function as it's shared with H_CPPR.
+ */
+ icp_rm_down_cppr(xics, icp, xirr >> 24);
+
+ /* IPIs have no EOI */
+ if (irq == XICS_IPI)
+ goto bail;
+ /*
+ * EOI handling: If the interrupt is still asserted, we need to
+ * resend it. We can take a lockless "peek" at the ICS state here.
+ *
+ * "Message" interrupts will never have "asserted" set
+ */
+ ics = kvmppc_xics_find_ics(xics, irq, &src);
+ if (!ics)
+ goto bail;
+ state = &ics->irq_state[src];
+
+ /* Still asserted, resend it, we make it look like a reject */
+ if (state->asserted) {
+ icp->rm_action |= XICS_RM_REJECT;
+ icp->rm_reject = irq;
+ }
+ bail:
+ return check_too_hard(xics, icp);
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index e33d11f1b977..b02f91e4c70d 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -79,10 +79,6 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
* *
*****************************************************************************/
-#define XICS_XIRR 4
-#define XICS_QIRR 0xc
-#define XICS_IPI 2 /* interrupt source # for IPIs */
-
/*
* We come in here when wakened from nap mode on a secondary hw thread.
* Relocation is off and most register values are lost.
@@ -101,50 +97,51 @@ kvm_start_guest:
li r0,1
stb r0,PACA_NAPSTATELOST(r13)
- /* get vcpu pointer, NULL if we have no vcpu to run */
- ld r4,HSTATE_KVM_VCPU(r13)
- cmpdi cr1,r4,0
+ /* were we napping due to cede? */
+ lbz r0,HSTATE_NAPPING(r13)
+ cmpwi r0,0
+ bne kvm_end_cede
+
+ /*
+ * We weren't napping due to cede, so this must be a secondary
+ * thread being woken up to run a guest, or being woken up due
+ * to a stray IPI. (Or due to some machine check or hypervisor
+ * maintenance interrupt while the core is in KVM.)
+ */
/* Check the wake reason in SRR1 to see why we got here */
mfspr r3,SPRN_SRR1
rlwinm r3,r3,44-31,0x7 /* extract wake reason field */
cmpwi r3,4 /* was it an external interrupt? */
- bne 27f
-
- /*
- * External interrupt - for now assume it is an IPI, since we
- * should never get any other interrupts sent to offline threads.
- * Only do this for secondary threads.
- */
- beq cr1,25f
- lwz r3,VCPU_PTID(r4)
- cmpwi r3,0
- beq 27f
-25: ld r5,HSTATE_XICS_PHYS(r13)
- li r0,0xff
- li r6,XICS_QIRR
- li r7,XICS_XIRR
+ bne 27f /* if not */
+ ld r5,HSTATE_XICS_PHYS(r13)
+ li r7,XICS_XIRR /* if it was an external interrupt, */
lwzcix r8,r5,r7 /* get and ack the interrupt */
sync
clrldi. r9,r8,40 /* get interrupt source ID. */
- beq 27f /* none there? */
- cmpwi r9,XICS_IPI
- bne 26f
+ beq 28f /* none there? */
+ cmpwi r9,XICS_IPI /* was it an IPI? */
+ bne 29f
+ li r0,0xff
+ li r6,XICS_MFRR
stbcix r0,r5,r6 /* clear IPI */
-26: stwcix r8,r5,r7 /* EOI the interrupt */
-
-27: /* XXX should handle hypervisor maintenance interrupts etc. here */
+ stwcix r8,r5,r7 /* EOI the interrupt */
+ sync /* order loading of vcpu after that */
- /* reload vcpu pointer after clearing the IPI */
+ /* get vcpu pointer, NULL if we have no vcpu to run */
ld r4,HSTATE_KVM_VCPU(r13)
cmpdi r4,0
/* if we have no vcpu to run, go back to sleep */
beq kvm_no_guest
+ b kvmppc_hv_entry
- /* were we napping due to cede? */
- lbz r0,HSTATE_NAPPING(r13)
- cmpwi r0,0
- bne kvm_end_cede
+27: /* XXX should handle hypervisor maintenance interrupts etc. here */
+ b kvm_no_guest
+28: /* SRR1 said external but ICP said nope?? */
+ b kvm_no_guest
+29: /* External non-IPI interrupt to offline secondary thread? help?? */
+ stw r8,HSTATE_SAVED_XIRR(r13)
+ b kvm_no_guest
.global kvmppc_hv_entry
kvmppc_hv_entry:
@@ -260,6 +257,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
lwz r5, LPPACA_YIELDCOUNT(r3)
addi r5, r5, 1
stw r5, LPPACA_YIELDCOUNT(r3)
+ li r6, 1
+ stb r6, VCPU_VPA_DIRTY(r4)
25:
/* Load up DAR and DSISR */
ld r5, VCPU_DAR(r4)
@@ -485,20 +484,20 @@ toc_tlbie_lock:
mtctr r6
mtxer r7
+ ld r10, VCPU_PC(r4)
+ ld r11, VCPU_MSR(r4)
kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */
ld r6, VCPU_SRR0(r4)
ld r7, VCPU_SRR1(r4)
- ld r10, VCPU_PC(r4)
- ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */
+ /* r11 = vcpu->arch.msr & ~MSR_HV */
rldicl r11, r11, 63 - MSR_HV_LG, 1
rotldi r11, r11, 1 + MSR_HV_LG
ori r11, r11, MSR_ME
/* Check if we can deliver an external or decrementer interrupt now */
ld r0,VCPU_PENDING_EXC(r4)
- li r8,(1 << BOOK3S_IRQPRIO_EXTERNAL)
- oris r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+ lis r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
and r0,r0,r8
cmpdi cr1,r0,0
andi. r0,r11,MSR_EE
@@ -526,10 +525,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
/* Move SRR0 and SRR1 into the respective regs */
5: mtspr SPRN_SRR0, r6
mtspr SPRN_SRR1, r7
- li r0,0
- stb r0,VCPU_CEDED(r4) /* cancel cede */
fast_guest_return:
+ li r0,0
+ stb r0,VCPU_CEDED(r4) /* cancel cede */
mtspr SPRN_HSRR0,r10
mtspr SPRN_HSRR1,r11
@@ -676,17 +675,99 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
cmpwi r12,BOOK3S_INTERRUPT_SYSCALL
beq hcall_try_real_mode
- /* Check for mediated interrupts (could be done earlier really ...) */
+ /* Only handle external interrupts here on arch 206 and later */
BEGIN_FTR_SECTION
- cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL
- bne+ 1f
- andi. r0,r11,MSR_EE
- beq 1f
- mfspr r5,SPRN_LPCR
- andi. r0,r5,LPCR_MER
- bne bounce_ext_interrupt
-1:
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+ b ext_interrupt_to_host
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
+
+ /* External interrupt ? */
+ cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
+ bne+ ext_interrupt_to_host
+
+ /* External interrupt, first check for host_ipi. If this is
+ * set, we know the host wants us out so let's do it now
+ */
+do_ext_interrupt:
+ lbz r0, HSTATE_HOST_IPI(r13)
+ cmpwi r0, 0
+ bne ext_interrupt_to_host
+
+ /* Now read the interrupt from the ICP */
+ ld r5, HSTATE_XICS_PHYS(r13)
+ li r7, XICS_XIRR
+ cmpdi r5, 0
+ beq- ext_interrupt_to_host
+ lwzcix r3, r5, r7
+ rlwinm. r0, r3, 0, 0xffffff
+ sync
+ beq 3f /* if nothing pending in the ICP */
+
+ /* We found something in the ICP...
+ *
+ * If it's not an IPI, stash it in the PACA and return to
+ * the host, we don't (yet) handle directing real external
+ * interrupts directly to the guest
+ */
+ cmpwi r0, XICS_IPI
+ bne ext_stash_for_host
+
+ /* It's an IPI, clear the MFRR and EOI it */
+ li r0, 0xff
+ li r6, XICS_MFRR
+ stbcix r0, r5, r6 /* clear the IPI */
+ stwcix r3, r5, r7 /* EOI it */
+ sync
+
+ /* We need to re-check host IPI now in case it got set in the
+ * meantime. If it's clear, we bounce the interrupt to the
+ * guest
+ */
+ lbz r0, HSTATE_HOST_IPI(r13)
+ cmpwi r0, 0
+ bne- 1f
+
+ /* Allright, looks like an IPI for the guest, we need to set MER */
+3:
+ /* Check if any CPU is heading out to the host, if so head out too */
+ ld r5, HSTATE_KVM_VCORE(r13)
+ lwz r0, VCORE_ENTRY_EXIT(r5)
+ cmpwi r0, 0x100
+ bge ext_interrupt_to_host
+
+ /* See if there is a pending interrupt for the guest */
+ mfspr r8, SPRN_LPCR
+ ld r0, VCPU_PENDING_EXC(r9)
+ /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */
+ rldicl. r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63
+ rldimi r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH
+ beq 2f
+
+ /* And if the guest EE is set, we can deliver immediately, else
+ * we return to the guest with MER set
+ */
+ andi. r0, r11, MSR_EE
+ beq 2f
+ mtspr SPRN_SRR0, r10
+ mtspr SPRN_SRR1, r11
+ li r10, BOOK3S_INTERRUPT_EXTERNAL
+ li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
+ rotldi r11, r11, 63
+2: mr r4, r9
+ mtspr SPRN_LPCR, r8
+ b fast_guest_return
+
+ /* We raced with the host, we need to resend that IPI, bummer */
+1: li r0, IPI_PRIORITY
+ stbcix r0, r5, r6 /* set the IPI */
+ sync
+ b ext_interrupt_to_host
+
+ext_stash_for_host:
+ /* It's not an IPI and it's for the host, stash it in the PACA
+ * before exit, it will be picked up by the host ICP driver
+ */
+ stw r3, HSTATE_SAVED_XIRR(r13)
+ext_interrupt_to_host:
guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
/* Save DEC */
@@ -829,7 +910,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
beq 44f
ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */
li r0,IPI_PRIORITY
- li r7,XICS_QIRR
+ li r7,XICS_MFRR
stbcix r0,r7,r8 /* trigger the IPI */
44: srdi. r3,r3,1
addi r6,r6,PACA_SIZE
@@ -1018,6 +1099,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
lwz r3, LPPACA_YIELDCOUNT(r8)
addi r3, r3, 1
stw r3, LPPACA_YIELDCOUNT(r8)
+ li r3, 1
+ stb r3, VCPU_VPA_DIRTY(r9)
25:
/* Save PMU registers if requested */
/* r8 and cr0.eq are live here */
@@ -1350,11 +1433,19 @@ hcall_real_table:
.long 0 /* 0x58 */
.long 0 /* 0x5c */
.long 0 /* 0x60 */
- .long 0 /* 0x64 */
- .long 0 /* 0x68 */
- .long 0 /* 0x6c */
- .long 0 /* 0x70 */
- .long 0 /* 0x74 */
+#ifdef CONFIG_KVM_XICS
+ .long .kvmppc_rm_h_eoi - hcall_real_table
+ .long .kvmppc_rm_h_cppr - hcall_real_table
+ .long .kvmppc_rm_h_ipi - hcall_real_table
+ .long 0 /* 0x70 - H_IPOLL */
+ .long .kvmppc_rm_h_xirr - hcall_real_table
+#else
+ .long 0 /* 0x64 - H_EOI */
+ .long 0 /* 0x68 - H_CPPR */
+ .long 0 /* 0x6c - H_IPI */
+ .long 0 /* 0x70 - H_IPOLL */
+ .long 0 /* 0x74 - H_XIRR */
+#endif
.long 0 /* 0x78 */
.long 0 /* 0x7c */
.long 0 /* 0x80 */
@@ -1405,15 +1496,6 @@ ignore_hdec:
mr r4,r9
b fast_guest_return
-bounce_ext_interrupt:
- mr r4,r9
- mtspr SPRN_SRR0,r10
- mtspr SPRN_SRR1,r11
- li r10,BOOK3S_INTERRUPT_EXTERNAL
- li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
- rotldi r11,r11,63
- b fast_guest_return
-
_GLOBAL(kvmppc_h_set_dabr)
std r4,VCPU_DABR(r3)
/* Work around P7 bug where DABR can get corrupted on mtspr */
@@ -1519,6 +1601,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
b .
kvm_end_cede:
+ /* get vcpu pointer */
+ ld r4, HSTATE_KVM_VCPU(r13)
+
/* Woken by external or decrementer interrupt */
ld r1, HSTATE_HOST_R1(r13)
@@ -1558,6 +1643,16 @@ kvm_end_cede:
li r0,0
stb r0,HSTATE_NAPPING(r13)
+ /* Check the wake reason in SRR1 to see why we got here */
+ mfspr r3, SPRN_SRR1
+ rlwinm r3, r3, 44-31, 0x7 /* extract wake reason field */
+ cmpwi r3, 4 /* was it an external interrupt? */
+ li r12, BOOK3S_INTERRUPT_EXTERNAL
+ mr r9, r4
+ ld r10, VCPU_PC(r9)
+ ld r11, VCPU_MSR(r9)
+ beq do_ext_interrupt /* if so */
+
/* see if any other thread is already exiting */
lwz r0,VCORE_ENTRY_EXIT(r5)
cmpwi r0,0x100
@@ -1577,8 +1672,7 @@ kvm_cede_prodded:
/* we've ceded but we want to give control to the host */
kvm_cede_exit:
- li r3,H_TOO_HARD
- blr
+ b hcall_real_fallback
/* Try to handle a machine check in real mode */
machine_check_realmode:
@@ -1626,7 +1720,7 @@ secondary_nap:
beq 37f
sync
li r0, 0xff
- li r6, XICS_QIRR
+ li r6, XICS_MFRR
stbcix r0, r5, r6 /* clear the IPI */
stwcix r3, r5, r7 /* EOI it */
37: sync
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 286e23e6b92d..d09baf143500 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -762,9 +762,7 @@ program_interrupt:
run->exit_reason = KVM_EXIT_MMIO;
r = RESUME_HOST_NV;
break;
- case EMULATE_DO_PAPR:
- run->exit_reason = KVM_EXIT_PAPR_HCALL;
- vcpu->arch.hcall_needed = 1;
+ case EMULATE_EXIT_USER:
r = RESUME_HOST_NV;
break;
default:
@@ -1298,6 +1296,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
{
#ifdef CONFIG_PPC64
INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+ INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
#endif
if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index ee02b30878ed..b24309c6c2d5 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -227,6 +227,13 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
return EMULATE_DONE;
}
+static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+{
+ long rc = kvmppc_xics_hcall(vcpu, cmd);
+ kvmppc_set_gpr(vcpu, 3, rc);
+ return EMULATE_DONE;
+}
+
int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
{
switch (cmd) {
@@ -246,6 +253,20 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
vcpu->stat.halt_wakeup++;
return EMULATE_DONE;
+ case H_XIRR:
+ case H_CPPR:
+ case H_EOI:
+ case H_IPI:
+ if (kvmppc_xics_enabled(vcpu))
+ return kvmppc_h_pr_xics_hcall(vcpu, cmd);
+ break;
+ case H_RTAS:
+ if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+ return RESUME_HOST;
+ if (kvmppc_rtas_hcall(vcpu))
+ break;
+ kvmppc_set_gpr(vcpu, 3, 0);
+ return EMULATE_DONE;
}
return EMULATE_FAIL;
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
new file mode 100644
index 000000000000..3219ba895246
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_rtas.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/err.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/rtas.h>
+
+#ifdef CONFIG_KVM_XICS
+static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+ u32 irq, server, priority;
+ int rc;
+
+ if (args->nargs != 3 || args->nret != 1) {
+ rc = -3;
+ goto out;
+ }
+
+ irq = args->args[0];
+ server = args->args[1];
+ priority = args->args[2];
+
+ rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
+ if (rc)
+ rc = -3;
+out:
+ args->rets[0] = rc;
+}
+
+static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+ u32 irq, server, priority;
+ int rc;
+
+ if (args->nargs != 1 || args->nret != 3) {
+ rc = -3;
+ goto out;
+ }
+
+ irq = args->args[0];
+
+ server = priority = 0;
+ rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
+ if (rc) {
+ rc = -3;
+ goto out;
+ }
+
+ args->rets[1] = server;
+ args->rets[2] = priority;
+out:
+ args->rets[0] = rc;
+}
+
+static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+ u32 irq;
+ int rc;
+
+ if (args->nargs != 1 || args->nret != 1) {
+ rc = -3;
+ goto out;
+ }
+
+ irq = args->args[0];
+
+ rc = kvmppc_xics_int_off(vcpu->kvm, irq);
+ if (rc)
+ rc = -3;
+out:
+ args->rets[0] = rc;
+}
+
+static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+ u32 irq;
+ int rc;
+
+ if (args->nargs != 1 || args->nret != 1) {
+ rc = -3;
+ goto out;
+ }
+
+ irq = args->args[0];
+
+ rc = kvmppc_xics_int_on(vcpu->kvm, irq);
+ if (rc)
+ rc = -3;
+out:
+ args->rets[0] = rc;
+}
+#endif /* CONFIG_KVM_XICS */
+
+struct rtas_handler {
+ void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args);
+ char *name;
+};
+
+static struct rtas_handler rtas_handlers[] = {
+#ifdef CONFIG_KVM_XICS
+ { .name = "ibm,set-xive", .handler = kvm_rtas_set_xive },
+ { .name = "ibm,get-xive", .handler = kvm_rtas_get_xive },
+ { .name = "ibm,int-off", .handler = kvm_rtas_int_off },
+ { .name = "ibm,int-on", .handler = kvm_rtas_int_on },
+#endif
+};
+
+struct rtas_token_definition {
+ struct list_head list;
+ struct rtas_handler *handler;
+ u64 token;
+};
+
+static int rtas_name_matches(char *s1, char *s2)
+{
+ struct kvm_rtas_token_args args;
+ return !strncmp(s1, s2, sizeof(args.name));
+}
+
+static int rtas_token_undefine(struct kvm *kvm, char *name)
+{
+ struct rtas_token_definition *d, *tmp;
+
+ lockdep_assert_held(&kvm->lock);
+
+ list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
+ if (rtas_name_matches(d->handler->name, name)) {
+ list_del(&d->list);
+ kfree(d);
+ return 0;
+ }
+ }
+
+ /* It's not an error to undefine an undefined token */
+ return 0;
+}
+
+static int rtas_token_define(struct kvm *kvm, char *name, u64 token)
+{
+ struct rtas_token_definition *d;
+ struct rtas_handler *h = NULL;
+ bool found;
+ int i;
+
+ lockdep_assert_held(&kvm->lock);
+
+ list_for_each_entry(d, &kvm->arch.rtas_tokens, list) {
+ if (d->token == token)
+ return -EEXIST;
+ }
+
+ found = false;
+ for (i = 0; i < ARRAY_SIZE(rtas_handlers); i++) {
+ h = &rtas_handlers[i];
+ if (rtas_name_matches(h->name, name)) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ return -ENOENT;
+
+ d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ d->handler = h;
+ d->token = token;
+
+ list_add_tail(&d->list, &kvm->arch.rtas_tokens);
+
+ return 0;
+}
+
+int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_rtas_token_args args;
+ int rc;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+
+ mutex_lock(&kvm->lock);
+
+ if (args.token)
+ rc = rtas_token_define(kvm, args.name, args.token);
+ else
+ rc = rtas_token_undefine(kvm, args.name);
+
+ mutex_unlock(&kvm->lock);
+
+ return rc;
+}
+
+int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
+{
+ struct rtas_token_definition *d;
+ struct rtas_args args;
+ rtas_arg_t *orig_rets;
+ gpa_t args_phys;
+ int rc;
+
+ /* r4 contains the guest physical address of the RTAS args */
+ args_phys = kvmppc_get_gpr(vcpu, 4);
+
+ rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args));
+ if (rc)
+ goto fail;
+
+ /*
+ * args->rets is a pointer into args->args. Now that we've
+ * copied args we need to fix it up to point into our copy,
+ * not the guest args. We also need to save the original
+ * value so we can restore it on the way out.
+ */
+ orig_rets = args.rets;
+ args.rets = &args.args[args.nargs];
+
+ mutex_lock(&vcpu->kvm->lock);
+
+ rc = -ENOENT;
+ list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) {
+ if (d->token == args.token) {
+ d->handler->handler(vcpu, &args);
+ rc = 0;
+ break;
+ }
+ }
+
+ mutex_unlock(&vcpu->kvm->lock);
+
+ if (rc == 0) {
+ args.rets = orig_rets;
+ rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args));
+ if (rc)
+ goto fail;
+ }
+
+ return rc;
+
+fail:
+ /*
+ * We only get here if the guest has called RTAS with a bogus
+ * args pointer. That means we can't get to the args, and so we
+ * can't fail the RTAS call. So fail right out to userspace,
+ * which should kill the guest.
+ */
+ return rc;
+}
+
+void kvmppc_rtas_tokens_free(struct kvm *kvm)
+{
+ struct rtas_token_definition *d, *tmp;
+
+ lockdep_assert_held(&kvm->lock);
+
+ list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
+ list_del(&d->list);
+ kfree(d);
+ }
+}
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
new file mode 100644
index 000000000000..ee841ed8a690
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -0,0 +1,1130 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "book3s_xics.h"
+
+#if 1
+#define XICS_DBG(fmt...) do { } while (0)
+#else
+#define XICS_DBG(fmt...) trace_printk(fmt)
+#endif
+
+#define ENABLE_REALMODE true
+#define DEBUG_REALMODE false
+
+/*
+ * LOCKING
+ * =======
+ *
+ * Each ICS has a mutex protecting the information about the IRQ
+ * sources and avoiding simultaneous deliveries if the same interrupt.
+ *
+ * ICP operations are done via a single compare & swap transaction
+ * (most ICP state fits in the union kvmppc_icp_state)
+ */
+
+/*
+ * TODO
+ * ====
+ *
+ * - To speed up resends, keep a bitmap of "resend" set bits in the
+ * ICS
+ *
+ * - Speed up server# -> ICP lookup (array ? hash table ?)
+ *
+ * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
+ * locks array to improve scalability
+ *
+ * - ioctl's to save/restore the entire state for snapshot & migration
+ */
+
+/* -- ICS routines -- */
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+ u32 new_irq);
+
+static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
+{
+ struct ics_irq_state *state;
+ struct kvmppc_ics *ics;
+ u16 src;
+
+ XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
+
+ ics = kvmppc_xics_find_ics(xics, irq, &src);
+ if (!ics) {
+ XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
+ return -EINVAL;
+ }
+ state = &ics->irq_state[src];
+ if (!state->exists)
+ return -EINVAL;
+
+ /*
+ * We set state->asserted locklessly. This should be fine as
+ * we are the only setter, thus concurrent access is undefined
+ * to begin with.
+ */
+ if (level == KVM_INTERRUPT_SET_LEVEL)
+ state->asserted = 1;
+ else if (level == KVM_INTERRUPT_UNSET) {
+ state->asserted = 0;
+ return 0;
+ }
+
+ /* Attempt delivery */
+ icp_deliver_irq(xics, NULL, irq);
+
+ return 0;
+}
+
+static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+ struct kvmppc_icp *icp)
+{
+ int i;
+
+ mutex_lock(&ics->lock);
+
+ for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+ struct ics_irq_state *state = &ics->irq_state[i];
+
+ if (!state->resend)
+ continue;
+
+ XICS_DBG("resend %#x prio %#x\n", state->number,
+ state->priority);
+
+ mutex_unlock(&ics->lock);
+ icp_deliver_irq(xics, icp, state->number);
+ mutex_lock(&ics->lock);
+ }
+
+ mutex_unlock(&ics->lock);
+}
+
+static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+ struct ics_irq_state *state,
+ u32 server, u32 priority, u32 saved_priority)
+{
+ bool deliver;
+
+ mutex_lock(&ics->lock);
+
+ state->server = server;
+ state->priority = priority;
+ state->saved_priority = saved_priority;
+ deliver = false;
+ if ((state->masked_pending || state->resend) && priority != MASKED) {
+ state->masked_pending = 0;
+ deliver = true;
+ }
+
+ mutex_unlock(&ics->lock);
+
+ return deliver;
+}
+
+int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
+{
+ struct kvmppc_xics *xics = kvm->arch.xics;
+ struct kvmppc_icp *icp;
+ struct kvmppc_ics *ics;
+ struct ics_irq_state *state;
+ u16 src;
+
+ if (!xics)
+ return -ENODEV;
+
+ ics = kvmppc_xics_find_ics(xics, irq, &src);
+ if (!ics)
+ return -EINVAL;
+ state = &ics->irq_state[src];
+
+ icp = kvmppc_xics_find_server(kvm, server);
+ if (!icp)
+ return -EINVAL;
+
+ XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n",
+ irq, server, priority,
+ state->masked_pending, state->resend);
+
+ if (write_xive(xics, ics, state, server, priority, priority))
+ icp_deliver_irq(xics, icp, irq);
+
+ return 0;
+}
+
+int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
+{
+ struct kvmppc_xics *xics = kvm->arch.xics;
+ struct kvmppc_ics *ics;
+ struct ics_irq_state *state;
+ u16 src;
+
+ if (!xics)
+ return -ENODEV;
+
+ ics = kvmppc_xics_find_ics(xics, irq, &src);
+ if (!ics)
+ return -EINVAL;
+ state = &ics->irq_state[src];
+
+ mutex_lock(&ics->lock);
+ *server = state->server;
+ *priority = state->priority;
+ mutex_unlock(&ics->lock);
+
+ return 0;
+}
+
+int kvmppc_xics_int_on(struct kvm *kvm, u32 irq)
+{
+ struct kvmppc_xics *xics = kvm->arch.xics;
+ struct kvmppc_icp *icp;
+ struct kvmppc_ics *ics;
+ struct ics_irq_state *state;
+ u16 src;
+
+ if (!xics)
+ return -ENODEV;
+
+ ics = kvmppc_xics_find_ics(xics, irq, &src);
+ if (!ics)
+ return -EINVAL;
+ state = &ics->irq_state[src];
+
+ icp = kvmppc_xics_find_server(kvm, state->server);
+ if (!icp)
+ return -EINVAL;
+
+ if (write_xive(xics, ics, state, state->server, state->saved_priority,
+ state->saved_priority))
+ icp_deliver_irq(xics, icp, irq);
+
+ return 0;
+}
+
+int kvmppc_xics_int_off(struct kvm *kvm, u32 irq)
+{
+ struct kvmppc_xics *xics = kvm->arch.xics;
+ struct kvmppc_ics *ics;
+ struct ics_irq_state *state;
+ u16 src;
+
+ if (!xics)
+ return -ENODEV;
+
+ ics = kvmppc_xics_find_ics(xics, irq, &src);
+ if (!ics)
+ return -EINVAL;
+ state = &ics->irq_state[src];
+
+ write_xive(xics, ics, state, state->server, MASKED, state->priority);
+
+ return 0;
+}
+
+/* -- ICP routines, including hcalls -- */
+
+static inline bool icp_try_update(struct kvmppc_icp *icp,
+ union kvmppc_icp_state old,
+ union kvmppc_icp_state new,
+ bool change_self)
+{
+ bool success;
+
+ /* Calculate new output value */
+ new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+ /* Attempt atomic update */
+ success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+ if (!success)
+ goto bail;
+
+ XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+ icp->server_num,
+ old.cppr, old.mfrr, old.pending_pri, old.xisr,
+ old.need_resend, old.out_ee);
+ XICS_DBG("UPD - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+ new.cppr, new.mfrr, new.pending_pri, new.xisr,
+ new.need_resend, new.out_ee);
+ /*
+ * Check for output state update
+ *
+ * Note that this is racy since another processor could be updating
+ * the state already. This is why we never clear the interrupt output
+ * here, we only ever set it. The clear only happens prior to doing
+ * an update and only by the processor itself. Currently we do it
+ * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
+ *
+ * We also do not try to figure out whether the EE state has changed,
+ * we unconditionally set it if the new state calls for it. The reason
+ * for that is that we opportunistically remove the pending interrupt
+ * flag when raising CPPR, so we need to set it back here if an
+ * interrupt is still pending.
+ */
+ if (new.out_ee) {
+ kvmppc_book3s_queue_irqprio(icp->vcpu,
+ BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+ if (!change_self)
+ kvmppc_fast_vcpu_kick(icp->vcpu);
+ }
+ bail:
+ return success;
+}
+
+static void icp_check_resend(struct kvmppc_xics *xics,
+ struct kvmppc_icp *icp)
+{
+ u32 icsid;
+
+ /* Order this load with the test for need_resend in the caller */
+ smp_rmb();
+ for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+ struct kvmppc_ics *ics = xics->ics[icsid];
+
+ if (!test_and_clear_bit(icsid, icp->resend_map))
+ continue;
+ if (!ics)
+ continue;
+ ics_check_resend(xics, ics, icp);
+ }
+}
+
+static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+ u32 *reject)
+{
+ union kvmppc_icp_state old_state, new_state;
+ bool success;
+
+ XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority,
+ icp->server_num);
+
+ do {
+ old_state = new_state = ACCESS_ONCE(icp->state);
+
+ *reject = 0;
+
+ /* See if we can deliver */
+ success = new_state.cppr > priority &&
+ new_state.mfrr > priority &&
+ new_state.pending_pri > priority;
+
+ /*
+ * If we can, check for a rejection and perform the
+ * delivery
+ */
+ if (success) {
+ *reject = new_state.xisr;
+ new_state.xisr = irq;
+ new_state.pending_pri = priority;
+ } else {
+ /*
+ * If we failed to deliver we set need_resend
+ * so a subsequent CPPR state change causes us
+ * to try a new delivery.
+ */
+ new_state.need_resend = true;
+ }
+
+ } while (!icp_try_update(icp, old_state, new_state, false));
+
+ return success;
+}
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+ u32 new_irq)
+{
+ struct ics_irq_state *state;
+ struct kvmppc_ics *ics;
+ u32 reject;
+ u16 src;
+
+ /*
+ * This is used both for initial delivery of an interrupt and
+ * for subsequent rejection.
+ *
+ * Rejection can be racy vs. resends. We have evaluated the
+ * rejection in an atomic ICP transaction which is now complete,
+ * so potentially the ICP can already accept the interrupt again.
+ *
+ * So we need to retry the delivery. Essentially the reject path
+ * boils down to a failed delivery. Always.
+ *
+ * Now the interrupt could also have moved to a different target,
+ * thus we may need to re-do the ICP lookup as well
+ */
+
+ again:
+ /* Get the ICS state and lock it */
+ ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+ if (!ics) {
+ XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq);
+ return;
+ }
+ state = &ics->irq_state[src];
+
+ /* Get a lock on the ICS */
+ mutex_lock(&ics->lock);
+
+ /* Get our server */
+ if (!icp || state->server != icp->server_num) {
+ icp = kvmppc_xics_find_server(xics->kvm, state->server);
+ if (!icp) {
+ pr_warn("icp_deliver_irq: IRQ 0x%06x server 0x%x not found !\n",
+ new_irq, state->server);
+ goto out;
+ }
+ }
+
+ /* Clear the resend bit of that interrupt */
+ state->resend = 0;
+
+ /*
+ * If masked, bail out
+ *
+ * Note: PAPR doesn't mention anything about masked pending
+ * when doing a resend, only when doing a delivery.
+ *
+ * However that would have the effect of losing a masked
+ * interrupt that was rejected and isn't consistent with
+ * the whole masked_pending business which is about not
+ * losing interrupts that occur while masked.
+ *
+ * I don't differenciate normal deliveries and resends, this
+ * implementation will differ from PAPR and not lose such
+ * interrupts.
+ */
+ if (state->priority == MASKED) {
+ XICS_DBG("irq %#x masked pending\n", new_irq);
+ state->masked_pending = 1;
+ goto out;
+ }
+
+ /*
+ * Try the delivery, this will set the need_resend flag
+ * in the ICP as part of the atomic transaction if the
+ * delivery is not possible.
+ *
+ * Note that if successful, the new delivery might have itself
+ * rejected an interrupt that was "delivered" before we took the
+ * icp mutex.
+ *
+ * In this case we do the whole sequence all over again for the
+ * new guy. We cannot assume that the rejected interrupt is less
+ * favored than the new one, and thus doesn't need to be delivered,
+ * because by the time we exit icp_try_to_deliver() the target
+ * processor may well have alrady consumed & completed it, and thus
+ * the rejected interrupt might actually be already acceptable.
+ */
+ if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+ /*
+ * Delivery was successful, did we reject somebody else ?
+ */
+ if (reject && reject != XICS_IPI) {
+ mutex_unlock(&ics->lock);
+ new_irq = reject;
+ goto again;
+ }
+ } else {
+ /*
+ * We failed to deliver the interrupt we need to set the
+ * resend map bit and mark the ICS state as needing a resend
+ */
+ set_bit(ics->icsid, icp->resend_map);
+ state->resend = 1;
+
+ /*
+ * If the need_resend flag got cleared in the ICP some time
+ * between icp_try_to_deliver() atomic update and now, then
+ * we know it might have missed the resend_map bit. So we
+ * retry
+ */
+ smp_mb();
+ if (!icp->state.need_resend) {
+ mutex_unlock(&ics->lock);
+ goto again;
+ }
+ }
+ out:
+ mutex_unlock(&ics->lock);
+}
+
+static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+ u8 new_cppr)
+{
+ union kvmppc_icp_state old_state, new_state;
+ bool resend;
+
+ /*
+ * This handles several related states in one operation:
+ *
+ * ICP State: Down_CPPR
+ *
+ * Load CPPR with new value and if the XISR is 0
+ * then check for resends:
+ *
+ * ICP State: Resend
+ *
+ * If MFRR is more favored than CPPR, check for IPIs
+ * and notify ICS of a potential resend. This is done
+ * asynchronously (when used in real mode, we will have
+ * to exit here).
+ *
+ * We do not handle the complete Check_IPI as documented
+ * here. In the PAPR, this state will be used for both
+ * Set_MFRR and Down_CPPR. However, we know that we aren't
+ * changing the MFRR state here so we don't need to handle
+ * the case of an MFRR causing a reject of a pending irq,
+ * this will have been handled when the MFRR was set in the
+ * first place.
+ *
+ * Thus we don't have to handle rejects, only resends.
+ *
+ * When implementing real mode for HV KVM, resend will lead to
+ * a H_TOO_HARD return and the whole transaction will be handled
+ * in virtual mode.
+ */
+ do {
+ old_state = new_state = ACCESS_ONCE(icp->state);
+
+ /* Down_CPPR */
+ new_state.cppr = new_cppr;
+
+ /*
+ * Cut down Resend / Check_IPI / IPI
+ *
+ * The logic is that we cannot have a pending interrupt
+ * trumped by an IPI at this point (see above), so we
+ * know that either the pending interrupt is already an
+ * IPI (in which case we don't care to override it) or
+ * it's either more favored than us or non existent
+ */
+ if (new_state.mfrr < new_cppr &&
+ new_state.mfrr <= new_state.pending_pri) {
+ WARN_ON(new_state.xisr != XICS_IPI &&
+ new_state.xisr != 0);
+ new_state.pending_pri = new_state.mfrr;
+ new_state.xisr = XICS_IPI;
+ }
+
+ /* Latch/clear resend bit */
+ resend = new_state.need_resend;
+ new_state.need_resend = 0;
+
+ } while (!icp_try_update(icp, old_state, new_state, true));
+
+ /*
+ * Now handle resend checks. Those are asynchronous to the ICP
+ * state update in HW (ie bus transactions) so we can handle them
+ * separately here too
+ */
+ if (resend)
+ icp_check_resend(xics, icp);
+}
+
+static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
+{
+ union kvmppc_icp_state old_state, new_state;
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ u32 xirr;
+
+ /* First, remove EE from the processor */
+ kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+ BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+ /*
+ * ICP State: Accept_Interrupt
+ *
+ * Return the pending interrupt (if any) along with the
+ * current CPPR, then clear the XISR & set CPPR to the
+ * pending priority
+ */
+ do {
+ old_state = new_state = ACCESS_ONCE(icp->state);
+
+ xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+ if (!old_state.xisr)
+ break;
+ new_state.cppr = new_state.pending_pri;
+ new_state.pending_pri = 0xff;
+ new_state.xisr = 0;
+
+ } while (!icp_try_update(icp, old_state, new_state, true));
+
+ XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr);
+
+ return xirr;
+}
+
+static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+ unsigned long mfrr)
+{
+ union kvmppc_icp_state old_state, new_state;
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ struct kvmppc_icp *icp;
+ u32 reject;
+ bool resend;
+ bool local;
+
+ XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n",
+ vcpu->vcpu_id, server, mfrr);
+
+ icp = vcpu->arch.icp;
+ local = icp->server_num == server;
+ if (!local) {
+ icp = kvmppc_xics_find_server(vcpu->kvm, server);
+ if (!icp)
+ return H_PARAMETER;
+ }
+
+ /*
+ * ICP state: Set_MFRR
+ *
+ * If the CPPR is more favored than the new MFRR, then
+ * nothing needs to be rejected as there can be no XISR to
+ * reject. If the MFRR is being made less favored then
+ * there might be a previously-rejected interrupt needing
+ * to be resent.
+ *
+ * If the CPPR is less favored, then we might be replacing
+ * an interrupt, and thus need to possibly reject it as in
+ *
+ * ICP state: Check_IPI
+ */
+ do {
+ old_state = new_state = ACCESS_ONCE(icp->state);
+
+ /* Set_MFRR */
+ new_state.mfrr = mfrr;
+
+ /* Check_IPI */
+ reject = 0;
+ resend = false;
+ if (mfrr < new_state.cppr) {
+ /* Reject a pending interrupt if not an IPI */
+ if (mfrr <= new_state.pending_pri)
+ reject = new_state.xisr;
+ new_state.pending_pri = mfrr;
+ new_state.xisr = XICS_IPI;
+ }
+
+ if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
+ resend = new_state.need_resend;
+ new_state.need_resend = 0;
+ }
+ } while (!icp_try_update(icp, old_state, new_state, local));
+
+ /* Handle reject */
+ if (reject && reject != XICS_IPI)
+ icp_deliver_irq(xics, icp, reject);
+
+ /* Handle resend */
+ if (resend)
+ icp_check_resend(xics, icp);
+
+ return H_SUCCESS;
+}
+
+static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+ union kvmppc_icp_state old_state, new_state;
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ u32 reject;
+
+ XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr);
+
+ /*
+ * ICP State: Set_CPPR
+ *
+ * We can safely compare the new value with the current
+ * value outside of the transaction as the CPPR is only
+ * ever changed by the processor on itself
+ */
+ if (cppr > icp->state.cppr)
+ icp_down_cppr(xics, icp, cppr);
+ else if (cppr == icp->state.cppr)
+ return;
+
+ /*
+ * ICP State: Up_CPPR
+ *
+ * The processor is raising its priority, this can result
+ * in a rejection of a pending interrupt:
+ *
+ * ICP State: Reject_Current
+ *
+ * We can remove EE from the current processor, the update
+ * transaction will set it again if needed
+ */
+ kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+ BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+ do {
+ old_state = new_state = ACCESS_ONCE(icp->state);
+
+ reject = 0;
+ new_state.cppr = cppr;
+
+ if (cppr <= new_state.pending_pri) {
+ reject = new_state.xisr;
+ new_state.xisr = 0;
+ new_state.pending_pri = 0xff;
+ }
+
+ } while (!icp_try_update(icp, old_state, new_state, true));
+
+ /*
+ * Check for rejects. They are handled by doing a new delivery
+ * attempt (see comments in icp_deliver_irq).
+ */
+ if (reject && reject != XICS_IPI)
+ icp_deliver_irq(xics, icp, reject);
+}
+
+static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ struct kvmppc_ics *ics;
+ struct ics_irq_state *state;
+ u32 irq = xirr & 0x00ffffff;
+ u16 src;
+
+ XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
+
+ /*
+ * ICP State: EOI
+ *
+ * Note: If EOI is incorrectly used by SW to lower the CPPR
+ * value (ie more favored), we do not check for rejection of
+ * a pending interrupt, this is a SW error and PAPR sepcifies
+ * that we don't have to deal with it.
+ *
+ * The sending of an EOI to the ICS is handled after the
+ * CPPR update
+ *
+ * ICP State: Down_CPPR which we handle
+ * in a separate function as it's shared with H_CPPR.
+ */
+ icp_down_cppr(xics, icp, xirr >> 24);
+
+ /* IPIs have no EOI */
+ if (irq == XICS_IPI)
+ return H_SUCCESS;
+ /*
+ * EOI handling: If the interrupt is still asserted, we need to
+ * resend it. We can take a lockless "peek" at the ICS state here.
+ *
+ * "Message" interrupts will never have "asserted" set
+ */
+ ics = kvmppc_xics_find_ics(xics, irq, &src);
+ if (!ics) {
+ XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
+ return H_PARAMETER;
+ }
+ state = &ics->irq_state[src];
+
+ /* Still asserted, resend it */
+ if (state->asserted)
+ icp_deliver_irq(xics, icp, irq);
+
+ return H_SUCCESS;
+}
+
+static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+{
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+
+ XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n",
+ hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt);
+
+ if (icp->rm_action & XICS_RM_KICK_VCPU)
+ kvmppc_fast_vcpu_kick(icp->rm_kick_target);
+ if (icp->rm_action & XICS_RM_CHECK_RESEND)
+ icp_check_resend(xics, icp);
+ if (icp->rm_action & XICS_RM_REJECT)
+ icp_deliver_irq(xics, icp, icp->rm_reject);
+
+ icp->rm_action = 0;
+
+ return H_SUCCESS;
+}
+
+int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+{
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ unsigned long res;
+ int rc = H_SUCCESS;
+
+ /* Check if we have an ICP */
+ if (!xics || !vcpu->arch.icp)
+ return H_HARDWARE;
+
+ /* Check for real mode returning too hard */
+ if (xics->real_mode)
+ return kvmppc_xics_rm_complete(vcpu, req);
+
+ switch (req) {
+ case H_XIRR:
+ res = kvmppc_h_xirr(vcpu);
+ kvmppc_set_gpr(vcpu, 4, res);
+ break;
+ case H_CPPR:
+ kvmppc_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
+ break;
+ case H_EOI:
+ rc = kvmppc_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+ break;
+ case H_IPI:
+ rc = kvmppc_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5));
+ break;
+ }
+
+ return rc;
+}
+
+
+/* -- Initialisation code etc. -- */
+
+static int xics_debug_show(struct seq_file *m, void *private)
+{
+ struct kvmppc_xics *xics = m->private;
+ struct kvm *kvm = xics->kvm;
+ struct kvm_vcpu *vcpu;
+ int icsid, i;
+
+ if (!kvm)
+ return 0;
+
+ seq_printf(m, "=========\nICP state\n=========\n");
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ union kvmppc_icp_state state;
+
+ if (!icp)
+ continue;
+
+ state.raw = ACCESS_ONCE(icp->state.raw);
+ seq_printf(m, "cpu server %#lx XIRR:%#x PPRI:%#x CPPR:%#x MFRR:%#x OUT:%d NR:%d\n",
+ icp->server_num, state.xisr,
+ state.pending_pri, state.cppr, state.mfrr,
+ state.out_ee, state.need_resend);
+ }
+
+ for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
+ struct kvmppc_ics *ics = xics->ics[icsid];
+
+ if (!ics)
+ continue;
+
+ seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
+ icsid);
+
+ mutex_lock(&ics->lock);
+
+ for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+ struct ics_irq_state *irq = &ics->irq_state[i];
+
+ seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n",
+ irq->number, irq->server, irq->priority,
+ irq->saved_priority, irq->asserted,
+ irq->resend, irq->masked_pending);
+
+ }
+ mutex_unlock(&ics->lock);
+ }
+ return 0;
+}
+
+static int xics_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xics_debug_show, inode->i_private);
+}
+
+static const struct file_operations xics_debug_fops = {
+ .open = xics_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void xics_debugfs_init(struct kvmppc_xics *xics)
+{
+ char *name;
+
+ name = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics);
+ if (!name) {
+ pr_err("%s: no memory for name\n", __func__);
+ return;
+ }
+
+ xics->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root,
+ xics, &xics_debug_fops);
+
+ pr_debug("%s: created %s\n", __func__, name);
+ kfree(name);
+}
+
+struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
+ struct kvmppc_xics *xics, int irq)
+{
+ struct kvmppc_ics *ics;
+ int i, icsid;
+
+ icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+
+ mutex_lock(&kvm->lock);
+
+ /* ICS already exists - somebody else got here first */
+ if (xics->ics[icsid])
+ goto out;
+
+ /* Create the ICS */
+ ics = kzalloc(sizeof(struct kvmppc_ics), GFP_KERNEL);
+ if (!ics)
+ goto out;
+
+ mutex_init(&ics->lock);
+ ics->icsid = icsid;
+
+ for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+ ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i;
+ ics->irq_state[i].priority = MASKED;
+ ics->irq_state[i].saved_priority = MASKED;
+ }
+ smp_wmb();
+ xics->ics[icsid] = ics;
+
+ if (icsid > xics->max_icsid)
+ xics->max_icsid = icsid;
+
+ out:
+ mutex_unlock(&kvm->lock);
+ return xics->ics[icsid];
+}
+
+int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
+{
+ struct kvmppc_icp *icp;
+
+ if (!vcpu->kvm->arch.xics)
+ return -ENODEV;
+
+ if (kvmppc_xics_find_server(vcpu->kvm, server_num))
+ return -EEXIST;
+
+ icp = kzalloc(sizeof(struct kvmppc_icp), GFP_KERNEL);
+ if (!icp)
+ return -ENOMEM;
+
+ icp->vcpu = vcpu;
+ icp->server_num = server_num;
+ icp->state.mfrr = MASKED;
+ icp->state.pending_pri = MASKED;
+ vcpu->arch.icp = icp;
+
+ XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id);
+
+ return 0;
+}
+
+u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ union kvmppc_icp_state state;
+
+ if (!icp)
+ return 0;
+ state = icp->state;
+ return ((u64)state.cppr << KVM_REG_PPC_ICP_CPPR_SHIFT) |
+ ((u64)state.xisr << KVM_REG_PPC_ICP_XISR_SHIFT) |
+ ((u64)state.mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT) |
+ ((u64)state.pending_pri << KVM_REG_PPC_ICP_PPRI_SHIFT);
+}
+
+int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
+{
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ union kvmppc_icp_state old_state, new_state;
+ struct kvmppc_ics *ics;
+ u8 cppr, mfrr, pending_pri;
+ u32 xisr;
+ u16 src;
+ bool resend;
+
+ if (!icp || !xics)
+ return -ENOENT;
+
+ cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT;
+ xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
+ KVM_REG_PPC_ICP_XISR_MASK;
+ mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT;
+ pending_pri = icpval >> KVM_REG_PPC_ICP_PPRI_SHIFT;
+
+ /* Require the new state to be internally consistent */
+ if (xisr == 0) {
+ if (pending_pri != 0xff)
+ return -EINVAL;
+ } else if (xisr == XICS_IPI) {
+ if (pending_pri != mfrr || pending_pri >= cppr)
+ return -EINVAL;
+ } else {
+ if (pending_pri >= mfrr || pending_pri >= cppr)
+ return -EINVAL;
+ ics = kvmppc_xics_find_ics(xics, xisr, &src);
+ if (!ics)
+ return -EINVAL;
+ }
+
+ new_state.raw = 0;
+ new_state.cppr = cppr;
+ new_state.xisr = xisr;
+ new_state.mfrr = mfrr;
+ new_state.pending_pri = pending_pri;
+
+ /*
+ * Deassert the CPU interrupt request.
+ * icp_try_update will reassert it if necessary.
+ */
+ kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+ BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+ /*
+ * Note that if we displace an interrupt from old_state.xisr,
+ * we don't mark it as rejected. We expect userspace to set
+ * the state of the interrupt sources to be consistent with
+ * the ICP states (either before or afterwards, which doesn't
+ * matter). We do handle resends due to CPPR becoming less
+ * favoured because that is necessary to end up with a
+ * consistent state in the situation where userspace restores
+ * the ICS states before the ICP states.
+ */
+ do {
+ old_state = ACCESS_ONCE(icp->state);
+
+ if (new_state.mfrr <= old_state.mfrr) {
+ resend = false;
+ new_state.need_resend = old_state.need_resend;
+ } else {
+ resend = old_state.need_resend;
+ new_state.need_resend = 0;
+ }
+ } while (!icp_try_update(icp, old_state, new_state, false));
+
+ if (resend)
+ icp_check_resend(xics, icp);
+
+ return 0;
+}
+
+/* -- ioctls -- */
+
+int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args)
+{
+ struct kvmppc_xics *xics;
+ int r;
+
+ /* locking against multiple callers? */
+
+ xics = kvm->arch.xics;
+ if (!xics)
+ return -ENODEV;
+
+ switch (args->level) {
+ case KVM_INTERRUPT_SET:
+ case KVM_INTERRUPT_SET_LEVEL:
+ case KVM_INTERRUPT_UNSET:
+ r = ics_deliver_irq(xics, args->irq, args->level);
+ break;
+ default:
+ r = -EINVAL;
+ }
+
+ return r;
+}
+
+void kvmppc_xics_free(struct kvmppc_xics *xics)
+{
+ int i;
+ struct kvm *kvm = xics->kvm;
+
+ debugfs_remove(xics->dentry);
+
+ if (kvm)
+ kvm->arch.xics = NULL;
+
+ for (i = 0; i <= xics->max_icsid; i++)
+ kfree(xics->ics[i]);
+ kfree(xics);
+}
+
+int kvm_xics_create(struct kvm *kvm, u32 type)
+{
+ struct kvmppc_xics *xics;
+ int ret = 0;
+
+ xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+ if (!xics)
+ return -ENOMEM;
+
+ xics->kvm = kvm;
+
+ /* Already there ? */
+ mutex_lock(&kvm->lock);
+ if (kvm->arch.xics)
+ ret = -EEXIST;
+ else
+ kvm->arch.xics = xics;
+ mutex_unlock(&kvm->lock);
+
+ if (ret)
+ return ret;
+
+ xics_debugfs_init(xics);
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+ if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+ /* Enable real mode support */
+ xics->real_mode = ENABLE_REALMODE;
+ xics->real_mode_dbg = DEBUG_REALMODE;
+ }
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
+ return 0;
+}
+
+void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->arch.icp)
+ return;
+ kfree(vcpu->arch.icp);
+ vcpu->arch.icp = NULL;
+ vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
+}
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
new file mode 100644
index 000000000000..e4fdec3dde77
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _KVM_PPC_BOOK3S_XICS_H
+#define _KVM_PPC_BOOK3S_XICS_H
+
+/*
+ * We use a two-level tree to store interrupt source information.
+ * There are up to 1024 ICS nodes, each of which can represent
+ * 1024 sources.
+ */
+#define KVMPPC_XICS_MAX_ICS_ID 1023
+#define KVMPPC_XICS_ICS_SHIFT 10
+#define KVMPPC_XICS_IRQ_PER_ICS (1 << KVMPPC_XICS_ICS_SHIFT)
+#define KVMPPC_XICS_SRC_MASK (KVMPPC_XICS_IRQ_PER_ICS - 1)
+
+/*
+ * Interrupt source numbers below this are reserved, for example
+ * 0 is "no interrupt", and 2 is used for IPIs.
+ */
+#define KVMPPC_XICS_FIRST_IRQ 16
+#define KVMPPC_XICS_NR_IRQS ((KVMPPC_XICS_MAX_ICS_ID + 1) * \
+ KVMPPC_XICS_IRQ_PER_ICS)
+
+/* Priority value to use for disabling an interrupt */
+#define MASKED 0xff
+
+/* State for one irq source */
+struct ics_irq_state {
+ u32 number;
+ u32 server;
+ u8 priority;
+ u8 saved_priority;
+ u8 resend;
+ u8 masked_pending;
+ u8 asserted; /* Only for LSI */
+ u8 exists;
+};
+
+/* Atomic ICP state, updated with a single compare & swap */
+union kvmppc_icp_state {
+ unsigned long raw;
+ struct {
+ u8 out_ee:1;
+ u8 need_resend:1;
+ u8 cppr;
+ u8 mfrr;
+ u8 pending_pri;
+ u32 xisr;
+ };
+};
+
+/* One bit per ICS */
+#define ICP_RESEND_MAP_SIZE (KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1)
+
+struct kvmppc_icp {
+ struct kvm_vcpu *vcpu;
+ unsigned long server_num;
+ union kvmppc_icp_state state;
+ unsigned long resend_map[ICP_RESEND_MAP_SIZE];
+
+ /* Real mode might find something too hard, here's the action
+ * it might request from virtual mode
+ */
+#define XICS_RM_KICK_VCPU 0x1
+#define XICS_RM_CHECK_RESEND 0x2
+#define XICS_RM_REJECT 0x4
+ u32 rm_action;
+ struct kvm_vcpu *rm_kick_target;
+ u32 rm_reject;
+
+ /* Debug stuff for real mode */
+ union kvmppc_icp_state rm_dbgstate;
+ struct kvm_vcpu *rm_dbgtgt;
+};
+
+struct kvmppc_ics {
+ struct mutex lock;
+ u16 icsid;
+ struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
+};
+
+struct kvmppc_xics {
+ struct kvm *kvm;
+ struct dentry *dentry;
+ u32 max_icsid;
+ bool real_mode;
+ bool real_mode_dbg;
+ struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1];
+};
+
+static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm,
+ u32 nr)
+{
+ struct kvm_vcpu *vcpu = NULL;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->arch.icp && nr == vcpu->arch.icp->server_num)
+ return vcpu->arch.icp;
+ }
+ return NULL;
+}
+
+static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
+ u32 irq, u16 *source)
+{
+ u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+ u16 src = irq & KVMPPC_XICS_SRC_MASK;
+ struct kvmppc_ics *ics;
+
+ if (source)
+ *source = src;
+ if (icsid > KVMPPC_XICS_MAX_ICS_ID)
+ return NULL;
+ ics = xics->ics[icsid];
+ if (!ics)
+ return NULL;
+ return ics;
+}
+
+
+#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index a49a68a25c39..1020119226db 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -346,7 +346,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
keep_irq = true;
}
- if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_enabled)
+ if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_flags)
update_epr = true;
switch (priority) {
@@ -427,8 +427,14 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
set_guest_esr(vcpu, vcpu->arch.queued_esr);
if (update_dear == true)
set_guest_dear(vcpu, vcpu->arch.queued_dear);
- if (update_epr == true)
- kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
+ if (update_epr == true) {
+ if (vcpu->arch.epr_flags & KVMPPC_EPR_USER)
+ kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
+ else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL) {
+ BUG_ON(vcpu->arch.irq_type != KVMPPC_IRQ_MPIC);
+ kvmppc_mpic_set_epr(vcpu);
+ }
+ }
new_msr &= msr_mask;
#if defined(CONFIG_64BIT)
@@ -745,6 +751,9 @@ static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
kvmppc_core_queue_program(vcpu, ESR_PIL);
return RESUME_HOST;
+ case EMULATE_EXIT_USER:
+ return RESUME_HOST;
+
default:
BUG();
}
@@ -1412,120 +1421,134 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
{
- int r = -EINVAL;
+ int r = 0;
+ union kvmppc_one_reg val;
+ int size;
+ long int i;
+
+ size = one_reg_size(reg->id);
+ if (size > sizeof(val))
+ return -EINVAL;
switch (reg->id) {
case KVM_REG_PPC_IAC1:
case KVM_REG_PPC_IAC2:
case KVM_REG_PPC_IAC3:
- case KVM_REG_PPC_IAC4: {
- int iac = reg->id - KVM_REG_PPC_IAC1;
- r = copy_to_user((u64 __user *)(long)reg->addr,
- &vcpu->arch.dbg_reg.iac[iac], sizeof(u64));
+ case KVM_REG_PPC_IAC4:
+ i = reg->id - KVM_REG_PPC_IAC1;
+ val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac[i]);
break;
- }
case KVM_REG_PPC_DAC1:
- case KVM_REG_PPC_DAC2: {
- int dac = reg->id - KVM_REG_PPC_DAC1;
- r = copy_to_user((u64 __user *)(long)reg->addr,
- &vcpu->arch.dbg_reg.dac[dac], sizeof(u64));
+ case KVM_REG_PPC_DAC2:
+ i = reg->id - KVM_REG_PPC_DAC1;
+ val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac[i]);
break;
- }
case KVM_REG_PPC_EPR: {
u32 epr = get_guest_epr(vcpu);
- r = put_user(epr, (u32 __user *)(long)reg->addr);
+ val = get_reg_val(reg->id, epr);
break;
}
#if defined(CONFIG_64BIT)
case KVM_REG_PPC_EPCR:
- r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr);
+ val = get_reg_val(reg->id, vcpu->arch.epcr);
break;
#endif
case KVM_REG_PPC_TCR:
- r = put_user(vcpu->arch.tcr, (u32 __user *)(long)reg->addr);
+ val = get_reg_val(reg->id, vcpu->arch.tcr);
break;
case KVM_REG_PPC_TSR:
- r = put_user(vcpu->arch.tsr, (u32 __user *)(long)reg->addr);
+ val = get_reg_val(reg->id, vcpu->arch.tsr);
break;
- case KVM_REG_PPC_DEBUG_INST: {
- u32 opcode = KVMPPC_INST_EHPRIV;
- r = copy_to_user((u32 __user *)(long)reg->addr,
- &opcode, sizeof(u32));
+ case KVM_REG_PPC_DEBUG_INST:
+ val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV);
break;
- }
default:
+ r = kvmppc_get_one_reg(vcpu, reg->id, &val);
break;
}
+
+ if (r)
+ return r;
+
+ if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size))
+ r = -EFAULT;
+
return r;
}
int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
{
- int r = -EINVAL;
+ int r = 0;
+ union kvmppc_one_reg val;
+ int size;
+ long int i;
+
+ size = one_reg_size(reg->id);
+ if (size > sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size))
+ return -EFAULT;
switch (reg->id) {
case KVM_REG_PPC_IAC1:
case KVM_REG_PPC_IAC2:
case KVM_REG_PPC_IAC3:
- case KVM_REG_PPC_IAC4: {
- int iac = reg->id - KVM_REG_PPC_IAC1;
- r = copy_from_user(&vcpu->arch.dbg_reg.iac[iac],
- (u64 __user *)(long)reg->addr, sizeof(u64));
+ case KVM_REG_PPC_IAC4:
+ i = reg->id - KVM_REG_PPC_IAC1;
+ vcpu->arch.dbg_reg.iac[i] = set_reg_val(reg->id, val);
break;
- }
case KVM_REG_PPC_DAC1:
- case KVM_REG_PPC_DAC2: {
- int dac = reg->id - KVM_REG_PPC_DAC1;
- r = copy_from_user(&vcpu->arch.dbg_reg.dac[dac],
- (u64 __user *)(long)reg->addr, sizeof(u64));
+ case KVM_REG_PPC_DAC2:
+ i = reg->id - KVM_REG_PPC_DAC1;
+ vcpu->arch.dbg_reg.dac[i] = set_reg_val(reg->id, val);
break;
- }
case KVM_REG_PPC_EPR: {
- u32 new_epr;
- r = get_user(new_epr, (u32 __user *)(long)reg->addr);
- if (!r)
- kvmppc_set_epr(vcpu, new_epr);
+ u32 new_epr = set_reg_val(reg->id, val);
+ kvmppc_set_epr(vcpu, new_epr);
break;
}
#if defined(CONFIG_64BIT)
case KVM_REG_PPC_EPCR: {
- u32 new_epcr;
- r = get_user(new_epcr, (u32 __user *)(long)reg->addr);
- if (r == 0)
- kvmppc_set_epcr(vcpu, new_epcr);
+ u32 new_epcr = set_reg_val(reg->id, val);
+ kvmppc_set_epcr(vcpu, new_epcr);
break;
}
#endif
case KVM_REG_PPC_OR_TSR: {
- u32 tsr_bits;
- r = get_user(tsr_bits, (u32 __user *)(long)reg->addr);
+ u32 tsr_bits = set_reg_val(reg->id, val);
kvmppc_set_tsr_bits(vcpu, tsr_bits);
break;
}
case KVM_REG_PPC_CLEAR_TSR: {
- u32 tsr_bits;
- r = get_user(tsr_bits, (u32 __user *)(long)reg->addr);
+ u32 tsr_bits = set_reg_val(reg->id, val);
kvmppc_clr_tsr_bits(vcpu, tsr_bits);
break;
}
case KVM_REG_PPC_TSR: {
- u32 tsr;
- r = get_user(tsr, (u32 __user *)(long)reg->addr);
+ u32 tsr = set_reg_val(reg->id, val);
kvmppc_set_tsr(vcpu, tsr);
break;
}
case KVM_REG_PPC_TCR: {
- u32 tcr;
- r = get_user(tcr, (u32 __user *)(long)reg->addr);
+ u32 tcr = set_reg_val(reg->id, val);
kvmppc_set_tcr(vcpu, tcr);
break;
}
default:
+ r = kvmppc_set_one_reg(vcpu, reg->id, &val);
break;
}
+
return r;
}
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
+{
+ return -EINVAL;
+}
+
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
return -ENOTSUPP;
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 6dd4de7802bf..ce6b73c29612 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -425,6 +425,20 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
return kvmppc_set_sregs_ivor(vcpu, sregs);
}
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
+ union kvmppc_one_reg *val)
+{
+ int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+ return r;
+}
+
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
+ union kvmppc_one_reg *val)
+{
+ int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+ return r;
+}
+
struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
{
struct kvmppc_vcpu_e500 *vcpu_e500;
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 33db48a8ce24..c2e5e98453a6 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -23,6 +23,10 @@
#include <asm/mmu-book3e.h>
#include <asm/tlb.h>
+enum vcpu_ftr {
+ VCPU_FTR_MMU_V2
+};
+
#define E500_PID_NUM 3
#define E500_TLB_NUM 2
@@ -131,6 +135,10 @@ void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+ union kvmppc_one_reg *val);
+int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+ union kvmppc_one_reg *val);
#ifdef CONFIG_KVM_E500V2
unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
@@ -295,4 +303,18 @@ static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu)
#define get_tlb_sts(gtlbe) (MAS1_TS)
#endif /* !BOOKE_HV */
+static inline bool has_feature(const struct kvm_vcpu *vcpu,
+ enum vcpu_ftr ftr)
+{
+ bool has_ftr;
+ switch (ftr) {
+ case VCPU_FTR_MMU_V2:
+ has_ftr = ((vcpu->arch.mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2);
+ break;
+ default:
+ return false;
+ }
+ return has_ftr;
+}
+
#endif /* KVM_E500_H */
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index e78f353a836a..b10a01243abd 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -284,6 +284,16 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
case SPRN_TLB1CFG:
*spr_val = vcpu->arch.tlbcfg[1];
break;
+ case SPRN_TLB0PS:
+ if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+ return EMULATE_FAIL;
+ *spr_val = vcpu->arch.tlbps[0];
+ break;
+ case SPRN_TLB1PS:
+ if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+ return EMULATE_FAIL;
+ *spr_val = vcpu->arch.tlbps[1];
+ break;
case SPRN_L1CSR0:
*spr_val = vcpu_e500->l1csr0;
break;
@@ -307,6 +317,15 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
case SPRN_MMUCFG:
*spr_val = vcpu->arch.mmucfg;
break;
+ case SPRN_EPTCFG:
+ if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+ return EMULATE_FAIL;
+ /*
+ * Legacy Linux guests access EPTCFG register even if the E.PT
+ * category is disabled in the VM. Give them a chance to live.
+ */
+ *spr_val = vcpu->arch.eptcfg;
+ break;
/* extra exceptions */
case SPRN_IVOR32:
diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index 5c4475983f78..c41a5a96b558 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -596,6 +596,140 @@ int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
return 0;
}
+int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+ union kvmppc_one_reg *val)
+{
+ int r = 0;
+ long int i;
+
+ switch (id) {
+ case KVM_REG_PPC_MAS0:
+ *val = get_reg_val(id, vcpu->arch.shared->mas0);
+ break;
+ case KVM_REG_PPC_MAS1:
+ *val = get_reg_val(id, vcpu->arch.shared->mas1);
+ break;
+ case KVM_REG_PPC_MAS2:
+ *val = get_reg_val(id, vcpu->arch.shared->mas2);
+ break;
+ case KVM_REG_PPC_MAS7_3:
+ *val = get_reg_val(id, vcpu->arch.shared->mas7_3);
+ break;
+ case KVM_REG_PPC_MAS4:
+ *val = get_reg_val(id, vcpu->arch.shared->mas4);
+ break;
+ case KVM_REG_PPC_MAS6:
+ *val = get_reg_val(id, vcpu->arch.shared->mas6);
+ break;
+ case KVM_REG_PPC_MMUCFG:
+ *val = get_reg_val(id, vcpu->arch.mmucfg);
+ break;
+ case KVM_REG_PPC_EPTCFG:
+ *val = get_reg_val(id, vcpu->arch.eptcfg);
+ break;
+ case KVM_REG_PPC_TLB0CFG:
+ case KVM_REG_PPC_TLB1CFG:
+ case KVM_REG_PPC_TLB2CFG:
+ case KVM_REG_PPC_TLB3CFG:
+ i = id - KVM_REG_PPC_TLB0CFG;
+ *val = get_reg_val(id, vcpu->arch.tlbcfg[i]);
+ break;
+ case KVM_REG_PPC_TLB0PS:
+ case KVM_REG_PPC_TLB1PS:
+ case KVM_REG_PPC_TLB2PS:
+ case KVM_REG_PPC_TLB3PS:
+ i = id - KVM_REG_PPC_TLB0PS;
+ *val = get_reg_val(id, vcpu->arch.tlbps[i]);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+
+ return r;
+}
+
+int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+ union kvmppc_one_reg *val)
+{
+ int r = 0;
+ long int i;
+
+ switch (id) {
+ case KVM_REG_PPC_MAS0:
+ vcpu->arch.shared->mas0 = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_MAS1:
+ vcpu->arch.shared->mas1 = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_MAS2:
+ vcpu->arch.shared->mas2 = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_MAS7_3:
+ vcpu->arch.shared->mas7_3 = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_MAS4:
+ vcpu->arch.shared->mas4 = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_MAS6:
+ vcpu->arch.shared->mas6 = set_reg_val(id, *val);
+ break;
+ /* Only allow MMU registers to be set to the config supported by KVM */
+ case KVM_REG_PPC_MMUCFG: {
+ u32 reg = set_reg_val(id, *val);
+ if (reg != vcpu->arch.mmucfg)
+ r = -EINVAL;
+ break;
+ }
+ case KVM_REG_PPC_EPTCFG: {
+ u32 reg = set_reg_val(id, *val);
+ if (reg != vcpu->arch.eptcfg)
+ r = -EINVAL;
+ break;
+ }
+ case KVM_REG_PPC_TLB0CFG:
+ case KVM_REG_PPC_TLB1CFG:
+ case KVM_REG_PPC_TLB2CFG:
+ case KVM_REG_PPC_TLB3CFG: {
+ /* MMU geometry (N_ENTRY/ASSOC) can be set only using SW_TLB */
+ u32 reg = set_reg_val(id, *val);
+ i = id - KVM_REG_PPC_TLB0CFG;
+ if (reg != vcpu->arch.tlbcfg[i])
+ r = -EINVAL;
+ break;
+ }
+ case KVM_REG_PPC_TLB0PS:
+ case KVM_REG_PPC_TLB1PS:
+ case KVM_REG_PPC_TLB2PS:
+ case KVM_REG_PPC_TLB3PS: {
+ u32 reg = set_reg_val(id, *val);
+ i = id - KVM_REG_PPC_TLB0PS;
+ if (reg != vcpu->arch.tlbps[i])
+ r = -EINVAL;
+ break;
+ }
+ default:
+ r = -EINVAL;
+ break;
+ }
+
+ return r;
+}
+
+static int vcpu_mmu_geometry_update(struct kvm_vcpu *vcpu,
+ struct kvm_book3e_206_tlb_params *params)
+{
+ vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+ if (params->tlb_sizes[0] <= 2048)
+ vcpu->arch.tlbcfg[0] |= params->tlb_sizes[0];
+ vcpu->arch.tlbcfg[0] |= params->tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
+
+ vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+ vcpu->arch.tlbcfg[1] |= params->tlb_sizes[1];
+ vcpu->arch.tlbcfg[1] |= params->tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
+ return 0;
+}
+
int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
struct kvm_config_tlb *cfg)
{
@@ -692,16 +826,8 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
vcpu_e500->gtlb_offset[0] = 0;
vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0];
- vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE;
-
- vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
- if (params.tlb_sizes[0] <= 2048)
- vcpu->arch.tlbcfg[0] |= params.tlb_sizes[0];
- vcpu->arch.tlbcfg[0] |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
-
- vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
- vcpu->arch.tlbcfg[1] |= params.tlb_sizes[1];
- vcpu->arch.tlbcfg[1] |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
+ /* Update vcpu's MMU geometry based on SW_TLB input */
+ vcpu_mmu_geometry_update(vcpu, &params);
vcpu_e500->shared_tlb_pages = pages;
vcpu_e500->num_shared_tlb_pages = num_pages;
@@ -737,6 +863,39 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
return 0;
}
+/* Vcpu's MMU default configuration */
+static int vcpu_mmu_init(struct kvm_vcpu *vcpu,
+ struct kvmppc_e500_tlb_params *params)
+{
+ /* Initialize RASIZE, PIDSIZE, NTLBS and MAVN fields with host values*/
+ vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE;
+
+ /* Initialize TLBnCFG fields with host values and SW_TLB geometry*/
+ vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
+ ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+ vcpu->arch.tlbcfg[0] |= params[0].entries;
+ vcpu->arch.tlbcfg[0] |= params[0].ways << TLBnCFG_ASSOC_SHIFT;
+
+ vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) &
+ ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+ vcpu->arch.tlbcfg[1] |= params[1].entries;
+ vcpu->arch.tlbcfg[1] |= params[1].ways << TLBnCFG_ASSOC_SHIFT;
+
+ if (has_feature(vcpu, VCPU_FTR_MMU_V2)) {
+ vcpu->arch.tlbps[0] = mfspr(SPRN_TLB0PS);
+ vcpu->arch.tlbps[1] = mfspr(SPRN_TLB1PS);
+
+ vcpu->arch.mmucfg &= ~MMUCFG_LRAT;
+
+ /* Guest mmu emulation currently doesn't handle E.PT */
+ vcpu->arch.eptcfg = 0;
+ vcpu->arch.tlbcfg[0] &= ~TLBnCFG_PT;
+ vcpu->arch.tlbcfg[1] &= ~TLBnCFG_IND;
+ }
+
+ return 0;
+}
+
int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
{
struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
@@ -781,18 +940,7 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
if (!vcpu_e500->g2h_tlb1_map)
goto err;
- /* Init TLB configuration register */
- vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
- ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
- vcpu->arch.tlbcfg[0] |= vcpu_e500->gtlb_params[0].entries;
- vcpu->arch.tlbcfg[0] |=
- vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT;
-
- vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) &
- ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
- vcpu->arch.tlbcfg[1] |= vcpu_e500->gtlb_params[1].entries;
- vcpu->arch.tlbcfg[1] |=
- vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT;
+ vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params);
kvmppc_recalc_tlb1map_range(vcpu_e500);
return 0;
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
index 1f89d26e65fb..c3bdc0aeabe2 100644
--- a/arch/powerpc/kvm/e500mc.c
+++ b/arch/powerpc/kvm/e500mc.c
@@ -172,6 +172,8 @@ int kvmppc_core_check_processor_compat(void)
r = 0;
else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0)
r = 0;
+ else if (strcmp(cur_cpu_spec->cpu_name, "e6500") == 0)
+ r = 0;
else
r = -ENOTSUPP;
@@ -255,6 +257,20 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
return kvmppc_set_sregs_ivor(vcpu, sregs);
}
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
+ union kvmppc_one_reg *val)
+{
+ int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+ return r;
+}
+
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
+ union kvmppc_one_reg *val)
+{
+ int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val);
+ return r;
+}
+
struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
{
struct kvmppc_vcpu_e500 *vcpu_e500;
diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
new file mode 100644
index 000000000000..f1e27fdc8c2e
--- /dev/null
+++ b/arch/powerpc/kvm/irq.h
@@ -0,0 +1,17 @@
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/kvm_host.h>
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+ int ret = 0;
+
+#ifdef CONFIG_KVM_MPIC
+ ret = ret || (kvm->arch.mpic != NULL);
+#endif
+ smp_rmb();
+ return ret;
+}
+
+#endif
diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
new file mode 100644
index 000000000000..f3148f8cdc12
--- /dev/null
+++ b/arch/powerpc/kvm/mpic.c
@@ -0,0 +1,1843 @@
+/*
+ * OpenPIC emulation
+ *
+ * Copyright (c) 2004 Jocelyn Mayer
+ * 2011 Alexander Graf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <asm/uaccess.h>
+#include <asm/mpic.h>
+#include <asm/kvm_para.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_ppc.h>
+#include "iodev.h"
+
+#define MAX_CPU 32
+#define MAX_SRC 256
+#define MAX_TMR 4
+#define MAX_IPI 4
+#define MAX_MSI 8
+#define MAX_IRQ (MAX_SRC + MAX_IPI + MAX_TMR)
+#define VID 0x03 /* MPIC version ID */
+
+/* OpenPIC capability flags */
+#define OPENPIC_FLAG_IDR_CRIT (1 << 0)
+#define OPENPIC_FLAG_ILR (2 << 0)
+
+/* OpenPIC address map */
+#define OPENPIC_REG_SIZE 0x40000
+#define OPENPIC_GLB_REG_START 0x0
+#define OPENPIC_GLB_REG_SIZE 0x10F0
+#define OPENPIC_TMR_REG_START 0x10F0
+#define OPENPIC_TMR_REG_SIZE 0x220
+#define OPENPIC_MSI_REG_START 0x1600
+#define OPENPIC_MSI_REG_SIZE 0x200
+#define OPENPIC_SUMMARY_REG_START 0x3800
+#define OPENPIC_SUMMARY_REG_SIZE 0x800
+#define OPENPIC_SRC_REG_START 0x10000
+#define OPENPIC_SRC_REG_SIZE (MAX_SRC * 0x20)
+#define OPENPIC_CPU_REG_START 0x20000
+#define OPENPIC_CPU_REG_SIZE (0x100 + ((MAX_CPU - 1) * 0x1000))
+
+struct fsl_mpic_info {
+ int max_ext;
+};
+
+static struct fsl_mpic_info fsl_mpic_20 = {
+ .max_ext = 12,
+};
+
+static struct fsl_mpic_info fsl_mpic_42 = {
+ .max_ext = 12,
+};
+
+#define FRR_NIRQ_SHIFT 16
+#define FRR_NCPU_SHIFT 8
+#define FRR_VID_SHIFT 0
+
+#define VID_REVISION_1_2 2
+#define VID_REVISION_1_3 3
+
+#define VIR_GENERIC 0x00000000 /* Generic Vendor ID */
+
+#define GCR_RESET 0x80000000
+#define GCR_MODE_PASS 0x00000000
+#define GCR_MODE_MIXED 0x20000000
+#define GCR_MODE_PROXY 0x60000000
+
+#define TBCR_CI 0x80000000 /* count inhibit */
+#define TCCR_TOG 0x80000000 /* toggles when decrement to zero */
+
+#define IDR_EP_SHIFT 31
+#define IDR_EP_MASK (1 << IDR_EP_SHIFT)
+#define IDR_CI0_SHIFT 30
+#define IDR_CI1_SHIFT 29
+#define IDR_P1_SHIFT 1
+#define IDR_P0_SHIFT 0
+
+#define ILR_INTTGT_MASK 0x000000ff
+#define ILR_INTTGT_INT 0x00
+#define ILR_INTTGT_CINT 0x01 /* critical */
+#define ILR_INTTGT_MCP 0x02 /* machine check */
+#define NUM_OUTPUTS 3
+
+#define MSIIR_OFFSET 0x140
+#define MSIIR_SRS_SHIFT 29
+#define MSIIR_SRS_MASK (0x7 << MSIIR_SRS_SHIFT)
+#define MSIIR_IBS_SHIFT 24
+#define MSIIR_IBS_MASK (0x1f << MSIIR_IBS_SHIFT)
+
+static int get_current_cpu(void)
+{
+#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
+ struct kvm_vcpu *vcpu = current->thread.kvm_vcpu;
+ return vcpu ? vcpu->arch.irq_cpu_id : -1;
+#else
+ /* XXX */
+ return -1;
+#endif
+}
+
+static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
+ u32 val, int idx);
+static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
+ u32 *ptr, int idx);
+
+enum irq_type {
+ IRQ_TYPE_NORMAL = 0,
+ IRQ_TYPE_FSLINT, /* FSL internal interrupt -- level only */
+ IRQ_TYPE_FSLSPECIAL, /* FSL timer/IPI interrupt, edge, no polarity */
+};
+
+struct irq_queue {
+ /* Round up to the nearest 64 IRQs so that the queue length
+ * won't change when moving between 32 and 64 bit hosts.
+ */
+ unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63) & ~63)];
+ int next;
+ int priority;
+};
+
+struct irq_source {
+ uint32_t ivpr; /* IRQ vector/priority register */
+ uint32_t idr; /* IRQ destination register */
+ uint32_t destmask; /* bitmap of CPU destinations */
+ int last_cpu;
+ int output; /* IRQ level, e.g. ILR_INTTGT_INT */
+ int pending; /* TRUE if IRQ is pending */
+ enum irq_type type;
+ bool level:1; /* level-triggered */
+ bool nomask:1; /* critical interrupts ignore mask on some FSL MPICs */
+};
+
+#define IVPR_MASK_SHIFT 31
+#define IVPR_MASK_MASK (1 << IVPR_MASK_SHIFT)
+#define IVPR_ACTIVITY_SHIFT 30
+#define IVPR_ACTIVITY_MASK (1 << IVPR_ACTIVITY_SHIFT)
+#define IVPR_MODE_SHIFT 29
+#define IVPR_MODE_MASK (1 << IVPR_MODE_SHIFT)
+#define IVPR_POLARITY_SHIFT 23
+#define IVPR_POLARITY_MASK (1 << IVPR_POLARITY_SHIFT)
+#define IVPR_SENSE_SHIFT 22
+#define IVPR_SENSE_MASK (1 << IVPR_SENSE_SHIFT)
+
+#define IVPR_PRIORITY_MASK (0xF << 16)
+#define IVPR_PRIORITY(_ivprr_) ((int)(((_ivprr_) & IVPR_PRIORITY_MASK) >> 16))
+#define IVPR_VECTOR(opp, _ivprr_) ((_ivprr_) & (opp)->vector_mask)
+
+/* IDR[EP/CI] are only for FSL MPIC prior to v4.0 */
+#define IDR_EP 0x80000000 /* external pin */
+#define IDR_CI 0x40000000 /* critical interrupt */
+
+struct irq_dest {
+ struct kvm_vcpu *vcpu;
+
+ int32_t ctpr; /* CPU current task priority */
+ struct irq_queue raised;
+ struct irq_queue servicing;
+
+ /* Count of IRQ sources asserting on non-INT outputs */
+ uint32_t outputs_active[NUM_OUTPUTS];
+};
+
+struct openpic {
+ struct kvm *kvm;
+ struct kvm_device *dev;
+ struct kvm_io_device mmio;
+ struct list_head mmio_regions;
+ atomic_t users;
+
+ gpa_t reg_base;
+ spinlock_t lock;
+
+ /* Behavior control */
+ struct fsl_mpic_info *fsl;
+ uint32_t model;
+ uint32_t flags;
+ uint32_t nb_irqs;
+ uint32_t vid;
+ uint32_t vir; /* Vendor identification register */
+ uint32_t vector_mask;
+ uint32_t tfrr_reset;
+ uint32_t ivpr_reset;
+ uint32_t idr_reset;
+ uint32_t brr1;
+ uint32_t mpic_mode_mask;
+
+ /* Global registers */
+ uint32_t frr; /* Feature reporting register */
+ uint32_t gcr; /* Global configuration register */
+ uint32_t pir; /* Processor initialization register */
+ uint32_t spve; /* Spurious vector register */
+ uint32_t tfrr; /* Timer frequency reporting register */
+ /* Source registers */
+ struct irq_source src[MAX_IRQ];
+ /* Local registers per output pin */
+ struct irq_dest dst[MAX_CPU];
+ uint32_t nb_cpus;
+ /* Timer registers */
+ struct {
+ uint32_t tccr; /* Global timer current count register */
+ uint32_t tbcr; /* Global timer base count register */
+ } timers[MAX_TMR];
+ /* Shared MSI registers */
+ struct {
+ uint32_t msir; /* Shared Message Signaled Interrupt Register */
+ } msi[MAX_MSI];
+ uint32_t max_irq;
+ uint32_t irq_ipi0;
+ uint32_t irq_tim0;
+ uint32_t irq_msi;
+};
+
+
+static void mpic_irq_raise(struct openpic *opp, struct irq_dest *dst,
+ int output)
+{
+ struct kvm_interrupt irq = {
+ .irq = KVM_INTERRUPT_SET_LEVEL,
+ };
+
+ if (!dst->vcpu) {
+ pr_debug("%s: destination cpu %d does not exist\n",
+ __func__, (int)(dst - &opp->dst[0]));
+ return;
+ }
+
+ pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id,
+ output);
+
+ if (output != ILR_INTTGT_INT) /* TODO */
+ return;
+
+ kvm_vcpu_ioctl_interrupt(dst->vcpu, &irq);
+}
+
+static void mpic_irq_lower(struct openpic *opp, struct irq_dest *dst,
+ int output)
+{
+ if (!dst->vcpu) {
+ pr_debug("%s: destination cpu %d does not exist\n",
+ __func__, (int)(dst - &opp->dst[0]));
+ return;
+ }
+
+ pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id,
+ output);
+
+ if (output != ILR_INTTGT_INT) /* TODO */
+ return;
+
+ kvmppc_core_dequeue_external(dst->vcpu);
+}
+
+static inline void IRQ_setbit(struct irq_queue *q, int n_IRQ)
+{
+ set_bit(n_IRQ, q->queue);
+}
+
+static inline void IRQ_resetbit(struct irq_queue *q, int n_IRQ)
+{
+ clear_bit(n_IRQ, q->queue);
+}
+
+static inline int IRQ_testbit(struct irq_queue *q, int n_IRQ)
+{
+ return test_bit(n_IRQ, q->queue);
+}
+
+static void IRQ_check(struct openpic *opp, struct irq_queue *q)
+{
+ int irq = -1;
+ int next = -1;
+ int priority = -1;
+
+ for (;;) {
+ irq = find_next_bit(q->queue, opp->max_irq, irq + 1);
+ if (irq == opp->max_irq)
+ break;
+
+ pr_debug("IRQ_check: irq %d set ivpr_pr=%d pr=%d\n",
+ irq, IVPR_PRIORITY(opp->src[irq].ivpr), priority);
+
+ if (IVPR_PRIORITY(opp->src[irq].ivpr) > priority) {
+ next = irq;
+ priority = IVPR_PRIORITY(opp->src[irq].ivpr);
+ }
+ }
+
+ q->next = next;
+ q->priority = priority;
+}
+
+static int IRQ_get_next(struct openpic *opp, struct irq_queue *q)
+{
+ /* XXX: optimize */
+ IRQ_check(opp, q);
+
+ return q->next;
+}
+
+static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ,
+ bool active, bool was_active)
+{
+ struct irq_dest *dst;
+ struct irq_source *src;
+ int priority;
+
+ dst = &opp->dst[n_CPU];
+ src = &opp->src[n_IRQ];
+
+ pr_debug("%s: IRQ %d active %d was %d\n",
+ __func__, n_IRQ, active, was_active);
+
+ if (src->output != ILR_INTTGT_INT) {
+ pr_debug("%s: output %d irq %d active %d was %d count %d\n",
+ __func__, src->output, n_IRQ, active, was_active,
+ dst->outputs_active[src->output]);
+
+ /* On Freescale MPIC, critical interrupts ignore priority,
+ * IACK, EOI, etc. Before MPIC v4.1 they also ignore
+ * masking.
+ */
+ if (active) {
+ if (!was_active &&
+ dst->outputs_active[src->output]++ == 0) {
+ pr_debug("%s: Raise OpenPIC output %d cpu %d irq %d\n",
+ __func__, src->output, n_CPU, n_IRQ);
+ mpic_irq_raise(opp, dst, src->output);
+ }
+ } else {
+ if (was_active &&
+ --dst->outputs_active[src->output] == 0) {
+ pr_debug("%s: Lower OpenPIC output %d cpu %d irq %d\n",
+ __func__, src->output, n_CPU, n_IRQ);
+ mpic_irq_lower(opp, dst, src->output);
+ }
+ }
+
+ return;
+ }
+
+ priority = IVPR_PRIORITY(src->ivpr);
+
+ /* Even if the interrupt doesn't have enough priority,
+ * it is still raised, in case ctpr is lowered later.
+ */
+ if (active)
+ IRQ_setbit(&dst->raised, n_IRQ);
+ else
+ IRQ_resetbit(&dst->raised, n_IRQ);
+
+ IRQ_check(opp, &dst->raised);
+
+ if (active && priority <= dst->ctpr) {
+ pr_debug("%s: IRQ %d priority %d too low for ctpr %d on CPU %d\n",
+ __func__, n_IRQ, priority, dst->ctpr, n_CPU);
+ active = 0;
+ }
+
+ if (active) {
+ if (IRQ_get_next(opp, &dst->servicing) >= 0 &&
+ priority <= dst->servicing.priority) {
+ pr_debug("%s: IRQ %d is hidden by servicing IRQ %d on CPU %d\n",
+ __func__, n_IRQ, dst->servicing.next, n_CPU);
+ } else {
+ pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n",
+ __func__, n_CPU, n_IRQ, dst->raised.next);
+ mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+ }
+ } else {
+ IRQ_get_next(opp, &dst->servicing);
+ if (dst->raised.priority > dst->ctpr &&
+ dst->raised.priority > dst->servicing.priority) {
+ pr_debug("%s: IRQ %d inactive, IRQ %d prio %d above %d/%d, CPU %d\n",
+ __func__, n_IRQ, dst->raised.next,
+ dst->raised.priority, dst->ctpr,
+ dst->servicing.priority, n_CPU);
+ /* IRQ line stays asserted */
+ } else {
+ pr_debug("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n",
+ __func__, n_IRQ, dst->ctpr,
+ dst->servicing.priority, n_CPU);
+ mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+ }
+ }
+}
+
+/* update pic state because registers for n_IRQ have changed value */
+static void openpic_update_irq(struct openpic *opp, int n_IRQ)
+{
+ struct irq_source *src;
+ bool active, was_active;
+ int i;
+
+ src = &opp->src[n_IRQ];
+ active = src->pending;
+
+ if ((src->ivpr & IVPR_MASK_MASK) && !src->nomask) {
+ /* Interrupt source is disabled */
+ pr_debug("%s: IRQ %d is disabled\n", __func__, n_IRQ);
+ active = false;
+ }
+
+ was_active = !!(src->ivpr & IVPR_ACTIVITY_MASK);
+
+ /*
+ * We don't have a similar check for already-active because
+ * ctpr may have changed and we need to withdraw the interrupt.
+ */
+ if (!active && !was_active) {
+ pr_debug("%s: IRQ %d is already inactive\n", __func__, n_IRQ);
+ return;
+ }
+
+ if (active)
+ src->ivpr |= IVPR_ACTIVITY_MASK;
+ else
+ src->ivpr &= ~IVPR_ACTIVITY_MASK;
+
+ if (src->destmask == 0) {
+ /* No target */
+ pr_debug("%s: IRQ %d has no target\n", __func__, n_IRQ);
+ return;
+ }
+
+ if (src->destmask == (1 << src->last_cpu)) {
+ /* Only one CPU is allowed to receive this IRQ */
+ IRQ_local_pipe(opp, src->last_cpu, n_IRQ, active, was_active);
+ } else if (!(src->ivpr & IVPR_MODE_MASK)) {
+ /* Directed delivery mode */
+ for (i = 0; i < opp->nb_cpus; i++) {
+ if (src->destmask & (1 << i)) {
+ IRQ_local_pipe(opp, i, n_IRQ, active,
+ was_active);
+ }
+ }
+ } else {
+ /* Distributed delivery mode */
+ for (i = src->last_cpu + 1; i != src->last_cpu; i++) {
+ if (i == opp->nb_cpus)
+ i = 0;
+
+ if (src->destmask & (1 << i)) {
+ IRQ_local_pipe(opp, i, n_IRQ, active,
+ was_active);
+ src->last_cpu = i;
+ break;
+ }
+ }
+ }
+}
+
+static void openpic_set_irq(void *opaque, int n_IRQ, int level)
+{
+ struct openpic *opp = opaque;
+ struct irq_source *src;
+
+ if (n_IRQ >= MAX_IRQ) {
+ WARN_ONCE(1, "%s: IRQ %d out of range\n", __func__, n_IRQ);
+ return;
+ }
+
+ src = &opp->src[n_IRQ];
+ pr_debug("openpic: set irq %d = %d ivpr=0x%08x\n",
+ n_IRQ, level, src->ivpr);
+ if (src->level) {
+ /* level-sensitive irq */
+ src->pending = level;
+ openpic_update_irq(opp, n_IRQ);
+ } else {
+ /* edge-sensitive irq */
+ if (level) {
+ src->pending = 1;
+ openpic_update_irq(opp, n_IRQ);
+ }
+
+ if (src->output != ILR_INTTGT_INT) {
+ /* Edge-triggered interrupts shouldn't be used
+ * with non-INT delivery, but just in case,
+ * try to make it do something sane rather than
+ * cause an interrupt storm. This is close to
+ * what you'd probably see happen in real hardware.
+ */
+ src->pending = 0;
+ openpic_update_irq(opp, n_IRQ);
+ }
+ }
+}
+
+static void openpic_reset(struct openpic *opp)
+{
+ int i;
+
+ opp->gcr = GCR_RESET;
+ /* Initialise controller registers */
+ opp->frr = ((opp->nb_irqs - 1) << FRR_NIRQ_SHIFT) |
+ (opp->vid << FRR_VID_SHIFT);
+
+ opp->pir = 0;
+ opp->spve = -1 & opp->vector_mask;
+ opp->tfrr = opp->tfrr_reset;
+ /* Initialise IRQ sources */
+ for (i = 0; i < opp->max_irq; i++) {
+ opp->src[i].ivpr = opp->ivpr_reset;
+ opp->src[i].idr = opp->idr_reset;
+
+ switch (opp->src[i].type) {
+ case IRQ_TYPE_NORMAL:
+ opp->src[i].level =
+ !!(opp->ivpr_reset & IVPR_SENSE_MASK);
+ break;
+
+ case IRQ_TYPE_FSLINT:
+ opp->src[i].ivpr |= IVPR_POLARITY_MASK;
+ break;
+
+ case IRQ_TYPE_FSLSPECIAL:
+ break;
+ }
+ }
+ /* Initialise IRQ destinations */
+ for (i = 0; i < MAX_CPU; i++) {
+ opp->dst[i].ctpr = 15;
+ memset(&opp->dst[i].raised, 0, sizeof(struct irq_queue));
+ opp->dst[i].raised.next = -1;
+ memset(&opp->dst[i].servicing, 0, sizeof(struct irq_queue));
+ opp->dst[i].servicing.next = -1;
+ }
+ /* Initialise timers */
+ for (i = 0; i < MAX_TMR; i++) {
+ opp->timers[i].tccr = 0;
+ opp->timers[i].tbcr = TBCR_CI;
+ }
+ /* Go out of RESET state */
+ opp->gcr = 0;
+}
+
+static inline uint32_t read_IRQreg_idr(struct openpic *opp, int n_IRQ)
+{
+ return opp->src[n_IRQ].idr;
+}
+
+static inline uint32_t read_IRQreg_ilr(struct openpic *opp, int n_IRQ)
+{
+ if (opp->flags & OPENPIC_FLAG_ILR)
+ return opp->src[n_IRQ].output;
+
+ return 0xffffffff;
+}
+
+static inline uint32_t read_IRQreg_ivpr(struct openpic *opp, int n_IRQ)
+{
+ return opp->src[n_IRQ].ivpr;
+}
+
+static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ,
+ uint32_t val)
+{
+ struct irq_source *src = &opp->src[n_IRQ];
+ uint32_t normal_mask = (1UL << opp->nb_cpus) - 1;
+ uint32_t crit_mask = 0;
+ uint32_t mask = normal_mask;
+ int crit_shift = IDR_EP_SHIFT - opp->nb_cpus;
+ int i;
+
+ if (opp->flags & OPENPIC_FLAG_IDR_CRIT) {
+ crit_mask = mask << crit_shift;
+ mask |= crit_mask | IDR_EP;
+ }
+
+ src->idr = val & mask;
+ pr_debug("Set IDR %d to 0x%08x\n", n_IRQ, src->idr);
+
+ if (opp->flags & OPENPIC_FLAG_IDR_CRIT) {
+ if (src->idr & crit_mask) {
+ if (src->idr & normal_mask) {
+ pr_debug("%s: IRQ configured for multiple output types, using critical\n",
+ __func__);
+ }
+
+ src->output = ILR_INTTGT_CINT;
+ src->nomask = true;
+ src->destmask = 0;
+
+ for (i = 0; i < opp->nb_cpus; i++) {
+ int n_ci = IDR_CI0_SHIFT - i;
+
+ if (src->idr & (1UL << n_ci))
+ src->destmask |= 1UL << i;
+ }
+ } else {
+ src->output = ILR_INTTGT_INT;
+ src->nomask = false;
+ src->destmask = src->idr & normal_mask;
+ }
+ } else {
+ src->destmask = src->idr;
+ }
+}
+
+static inline void write_IRQreg_ilr(struct openpic *opp, int n_IRQ,
+ uint32_t val)
+{
+ if (opp->flags & OPENPIC_FLAG_ILR) {
+ struct irq_source *src = &opp->src[n_IRQ];
+
+ src->output = val & ILR_INTTGT_MASK;
+ pr_debug("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr,
+ src->output);
+
+ /* TODO: on MPIC v4.0 only, set nomask for non-INT */
+ }
+}
+
+static inline void write_IRQreg_ivpr(struct openpic *opp, int n_IRQ,
+ uint32_t val)
+{
+ uint32_t mask;
+
+ /* NOTE when implementing newer FSL MPIC models: starting with v4.0,
+ * the polarity bit is read-only on internal interrupts.
+ */
+ mask = IVPR_MASK_MASK | IVPR_PRIORITY_MASK | IVPR_SENSE_MASK |
+ IVPR_POLARITY_MASK | opp->vector_mask;
+
+ /* ACTIVITY bit is read-only */
+ opp->src[n_IRQ].ivpr =
+ (opp->src[n_IRQ].ivpr & IVPR_ACTIVITY_MASK) | (val & mask);
+
+ /* For FSL internal interrupts, The sense bit is reserved and zero,
+ * and the interrupt is always level-triggered. Timers and IPIs
+ * have no sense or polarity bits, and are edge-triggered.
+ */
+ switch (opp->src[n_IRQ].type) {
+ case IRQ_TYPE_NORMAL:
+ opp->src[n_IRQ].level =
+ !!(opp->src[n_IRQ].ivpr & IVPR_SENSE_MASK);
+ break;
+
+ case IRQ_TYPE_FSLINT:
+ opp->src[n_IRQ].ivpr &= ~IVPR_SENSE_MASK;
+ break;
+
+ case IRQ_TYPE_FSLSPECIAL:
+ opp->src[n_IRQ].ivpr &= ~(IVPR_POLARITY_MASK | IVPR_SENSE_MASK);
+ break;
+ }
+
+ openpic_update_irq(opp, n_IRQ);
+ pr_debug("Set IVPR %d to 0x%08x -> 0x%08x\n", n_IRQ, val,
+ opp->src[n_IRQ].ivpr);
+}
+
+static void openpic_gcr_write(struct openpic *opp, uint64_t val)
+{
+ if (val & GCR_RESET) {
+ openpic_reset(opp);
+ return;
+ }
+
+ opp->gcr &= ~opp->mpic_mode_mask;
+ opp->gcr |= val & opp->mpic_mode_mask;
+}
+
+static int openpic_gbl_write(void *opaque, gpa_t addr, u32 val)
+{
+ struct openpic *opp = opaque;
+ int err = 0;
+
+ pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+ if (addr & 0xF)
+ return 0;
+
+ switch (addr) {
+ case 0x00: /* Block Revision Register1 (BRR1) is Readonly */
+ break;
+ case 0x40:
+ case 0x50:
+ case 0x60:
+ case 0x70:
+ case 0x80:
+ case 0x90:
+ case 0xA0:
+ case 0xB0:
+ err = openpic_cpu_write_internal(opp, addr, val,
+ get_current_cpu());
+ break;
+ case 0x1000: /* FRR */
+ break;
+ case 0x1020: /* GCR */
+ openpic_gcr_write(opp, val);
+ break;
+ case 0x1080: /* VIR */
+ break;
+ case 0x1090: /* PIR */
+ /*
+ * This register is used to reset a CPU core --
+ * let userspace handle it.
+ */
+ err = -ENXIO;
+ break;
+ case 0x10A0: /* IPI_IVPR */
+ case 0x10B0:
+ case 0x10C0:
+ case 0x10D0: {
+ int idx;
+ idx = (addr - 0x10A0) >> 4;
+ write_IRQreg_ivpr(opp, opp->irq_ipi0 + idx, val);
+ break;
+ }
+ case 0x10E0: /* SPVE */
+ opp->spve = val & opp->vector_mask;
+ break;
+ default:
+ break;
+ }
+
+ return err;
+}
+
+static int openpic_gbl_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+ struct openpic *opp = opaque;
+ u32 retval;
+ int err = 0;
+
+ pr_debug("%s: addr %#llx\n", __func__, addr);
+ retval = 0xFFFFFFFF;
+ if (addr & 0xF)
+ goto out;
+
+ switch (addr) {
+ case 0x1000: /* FRR */
+ retval = opp->frr;
+ retval |= (opp->nb_cpus - 1) << FRR_NCPU_SHIFT;
+ break;
+ case 0x1020: /* GCR */
+ retval = opp->gcr;
+ break;
+ case 0x1080: /* VIR */
+ retval = opp->vir;
+ break;
+ case 0x1090: /* PIR */
+ retval = 0x00000000;
+ break;
+ case 0x00: /* Block Revision Register1 (BRR1) */
+ retval = opp->brr1;
+ break;
+ case 0x40:
+ case 0x50:
+ case 0x60:
+ case 0x70:
+ case 0x80:
+ case 0x90:
+ case 0xA0:
+ case 0xB0:
+ err = openpic_cpu_read_internal(opp, addr,
+ &retval, get_current_cpu());
+ break;
+ case 0x10A0: /* IPI_IVPR */
+ case 0x10B0:
+ case 0x10C0:
+ case 0x10D0:
+ {
+ int idx;
+ idx = (addr - 0x10A0) >> 4;
+ retval = read_IRQreg_ivpr(opp, opp->irq_ipi0 + idx);
+ }
+ break;
+ case 0x10E0: /* SPVE */
+ retval = opp->spve;
+ break;
+ default:
+ break;
+ }
+
+out:
+ pr_debug("%s: => 0x%08x\n", __func__, retval);
+ *ptr = retval;
+ return err;
+}
+
+static int openpic_tmr_write(void *opaque, gpa_t addr, u32 val)
+{
+ struct openpic *opp = opaque;
+ int idx;
+
+ addr += 0x10f0;
+
+ pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+ if (addr & 0xF)
+ return 0;
+
+ if (addr == 0x10f0) {
+ /* TFRR */
+ opp->tfrr = val;
+ return 0;
+ }
+
+ idx = (addr >> 6) & 0x3;
+ addr = addr & 0x30;
+
+ switch (addr & 0x30) {
+ case 0x00: /* TCCR */
+ break;
+ case 0x10: /* TBCR */
+ if ((opp->timers[idx].tccr & TCCR_TOG) != 0 &&
+ (val & TBCR_CI) == 0 &&
+ (opp->timers[idx].tbcr & TBCR_CI) != 0)
+ opp->timers[idx].tccr &= ~TCCR_TOG;
+
+ opp->timers[idx].tbcr = val;
+ break;
+ case 0x20: /* TVPR */
+ write_IRQreg_ivpr(opp, opp->irq_tim0 + idx, val);
+ break;
+ case 0x30: /* TDR */
+ write_IRQreg_idr(opp, opp->irq_tim0 + idx, val);
+ break;
+ }
+
+ return 0;
+}
+
+static int openpic_tmr_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+ struct openpic *opp = opaque;
+ uint32_t retval = -1;
+ int idx;
+
+ pr_debug("%s: addr %#llx\n", __func__, addr);
+ if (addr & 0xF)
+ goto out;
+
+ idx = (addr >> 6) & 0x3;
+ if (addr == 0x0) {
+ /* TFRR */
+ retval = opp->tfrr;
+ goto out;
+ }
+
+ switch (addr & 0x30) {
+ case 0x00: /* TCCR */
+ retval = opp->timers[idx].tccr;
+ break;
+ case 0x10: /* TBCR */
+ retval = opp->timers[idx].tbcr;
+ break;
+ case 0x20: /* TIPV */
+ retval = read_IRQreg_ivpr(opp, opp->irq_tim0 + idx);
+ break;
+ case 0x30: /* TIDE (TIDR) */
+ retval = read_IRQreg_idr(opp, opp->irq_tim0 + idx);
+ break;
+ }
+
+out:
+ pr_debug("%s: => 0x%08x\n", __func__, retval);
+ *ptr = retval;
+ return 0;
+}
+
+static int openpic_src_write(void *opaque, gpa_t addr, u32 val)
+{
+ struct openpic *opp = opaque;
+ int idx;
+
+ pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+
+ addr = addr & 0xffff;
+ idx = addr >> 5;
+
+ switch (addr & 0x1f) {
+ case 0x00:
+ write_IRQreg_ivpr(opp, idx, val);
+ break;
+ case 0x10:
+ write_IRQreg_idr(opp, idx, val);
+ break;
+ case 0x18:
+ write_IRQreg_ilr(opp, idx, val);
+ break;
+ }
+
+ return 0;
+}
+
+static int openpic_src_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+ struct openpic *opp = opaque;
+ uint32_t retval;
+ int idx;
+
+ pr_debug("%s: addr %#llx\n", __func__, addr);
+ retval = 0xFFFFFFFF;
+
+ addr = addr & 0xffff;
+ idx = addr >> 5;
+
+ switch (addr & 0x1f) {
+ case 0x00:
+ retval = read_IRQreg_ivpr(opp, idx);
+ break;
+ case 0x10:
+ retval = read_IRQreg_idr(opp, idx);
+ break;
+ case 0x18:
+ retval = read_IRQreg_ilr(opp, idx);
+ break;
+ }
+
+ pr_debug("%s: => 0x%08x\n", __func__, retval);
+ *ptr = retval;
+ return 0;
+}
+
+static int openpic_msi_write(void *opaque, gpa_t addr, u32 val)
+{
+ struct openpic *opp = opaque;
+ int idx = opp->irq_msi;
+ int srs, ibs;
+
+ pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
+ if (addr & 0xF)
+ return 0;
+
+ switch (addr) {
+ case MSIIR_OFFSET:
+ srs = val >> MSIIR_SRS_SHIFT;
+ idx += srs;
+ ibs = (val & MSIIR_IBS_MASK) >> MSIIR_IBS_SHIFT;
+ opp->msi[srs].msir |= 1 << ibs;
+ openpic_set_irq(opp, idx, 1);
+ break;
+ default:
+ /* most registers are read-only, thus ignored */
+ break;
+ }
+
+ return 0;
+}
+
+static int openpic_msi_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+ struct openpic *opp = opaque;
+ uint32_t r = 0;
+ int i, srs;
+
+ pr_debug("%s: addr %#llx\n", __func__, addr);
+ if (addr & 0xF)
+ return -ENXIO;
+
+ srs = addr >> 4;
+
+ switch (addr) {
+ case 0x00:
+ case 0x10:
+ case 0x20:
+ case 0x30:
+ case 0x40:
+ case 0x50:
+ case 0x60:
+ case 0x70: /* MSIRs */
+ r = opp->msi[srs].msir;
+ /* Clear on read */
+ opp->msi[srs].msir = 0;
+ openpic_set_irq(opp, opp->irq_msi + srs, 0);
+ break;
+ case 0x120: /* MSISR */
+ for (i = 0; i < MAX_MSI; i++)
+ r |= (opp->msi[i].msir ? 1 : 0) << i;
+ break;
+ }
+
+ pr_debug("%s: => 0x%08x\n", __func__, r);
+ *ptr = r;
+ return 0;
+}
+
+static int openpic_summary_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+ uint32_t r = 0;
+
+ pr_debug("%s: addr %#llx\n", __func__, addr);
+
+ /* TODO: EISR/EIMR */
+
+ *ptr = r;
+ return 0;
+}
+
+static int openpic_summary_write(void *opaque, gpa_t addr, u32 val)
+{
+ pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
+
+ /* TODO: EISR/EIMR */
+ return 0;
+}
+
+static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
+ u32 val, int idx)
+{
+ struct openpic *opp = opaque;
+ struct irq_source *src;
+ struct irq_dest *dst;
+ int s_IRQ, n_IRQ;
+
+ pr_debug("%s: cpu %d addr %#llx <= 0x%08x\n", __func__, idx,
+ addr, val);
+
+ if (idx < 0)
+ return 0;
+
+ if (addr & 0xF)
+ return 0;
+
+ dst = &opp->dst[idx];
+ addr &= 0xFF0;
+ switch (addr) {
+ case 0x40: /* IPIDR */
+ case 0x50:
+ case 0x60:
+ case 0x70:
+ idx = (addr - 0x40) >> 4;
+ /* we use IDE as mask which CPUs to deliver the IPI to still. */
+ opp->src[opp->irq_ipi0 + idx].destmask |= val;
+ openpic_set_irq(opp, opp->irq_ipi0 + idx, 1);
+ openpic_set_irq(opp, opp->irq_ipi0 + idx, 0);
+ break;
+ case 0x80: /* CTPR */
+ dst->ctpr = val & 0x0000000F;
+
+ pr_debug("%s: set CPU %d ctpr to %d, raised %d servicing %d\n",
+ __func__, idx, dst->ctpr, dst->raised.priority,
+ dst->servicing.priority);
+
+ if (dst->raised.priority <= dst->ctpr) {
+ pr_debug("%s: Lower OpenPIC INT output cpu %d due to ctpr\n",
+ __func__, idx);
+ mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+ } else if (dst->raised.priority > dst->servicing.priority) {
+ pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d\n",
+ __func__, idx, dst->raised.next);
+ mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+ }
+
+ break;
+ case 0x90: /* WHOAMI */
+ /* Read-only register */
+ break;
+ case 0xA0: /* IACK */
+ /* Read-only register */
+ break;
+ case 0xB0: { /* EOI */
+ int notify_eoi;
+
+ pr_debug("EOI\n");
+ s_IRQ = IRQ_get_next(opp, &dst->servicing);
+
+ if (s_IRQ < 0) {
+ pr_debug("%s: EOI with no interrupt in service\n",
+ __func__);
+ break;
+ }
+
+ IRQ_resetbit(&dst->servicing, s_IRQ);
+ /* Notify listeners that the IRQ is over */
+ notify_eoi = s_IRQ;
+ /* Set up next servicing IRQ */
+ s_IRQ = IRQ_get_next(opp, &dst->servicing);
+ /* Check queued interrupts. */
+ n_IRQ = IRQ_get_next(opp, &dst->raised);
+ src = &opp->src[n_IRQ];
+ if (n_IRQ != -1 &&
+ (s_IRQ == -1 ||
+ IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) {
+ pr_debug("Raise OpenPIC INT output cpu %d irq %d\n",
+ idx, n_IRQ);
+ mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+ }
+
+ spin_unlock(&opp->lock);
+ kvm_notify_acked_irq(opp->kvm, 0, notify_eoi);
+ spin_lock(&opp->lock);
+
+ break;
+ }
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int openpic_cpu_write(void *opaque, gpa_t addr, u32 val)
+{
+ struct openpic *opp = opaque;
+
+ return openpic_cpu_write_internal(opp, addr, val,
+ (addr & 0x1f000) >> 12);
+}
+
+static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst,
+ int cpu)
+{
+ struct irq_source *src;
+ int retval, irq;
+
+ pr_debug("Lower OpenPIC INT output\n");
+ mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+
+ irq = IRQ_get_next(opp, &dst->raised);
+ pr_debug("IACK: irq=%d\n", irq);
+
+ if (irq == -1)
+ /* No more interrupt pending */
+ return opp->spve;
+
+ src = &opp->src[irq];
+ if (!(src->ivpr & IVPR_ACTIVITY_MASK) ||
+ !(IVPR_PRIORITY(src->ivpr) > dst->ctpr)) {
+ pr_err("%s: bad raised IRQ %d ctpr %d ivpr 0x%08x\n",
+ __func__, irq, dst->ctpr, src->ivpr);
+ openpic_update_irq(opp, irq);
+ retval = opp->spve;
+ } else {
+ /* IRQ enter servicing state */
+ IRQ_setbit(&dst->servicing, irq);
+ retval = IVPR_VECTOR(opp, src->ivpr);
+ }
+
+ if (!src->level) {
+ /* edge-sensitive IRQ */
+ src->ivpr &= ~IVPR_ACTIVITY_MASK;
+ src->pending = 0;
+ IRQ_resetbit(&dst->raised, irq);
+ }
+
+ if ((irq >= opp->irq_ipi0) && (irq < (opp->irq_ipi0 + MAX_IPI))) {
+ src->destmask &= ~(1 << cpu);
+ if (src->destmask && !src->level) {
+ /* trigger on CPUs that didn't know about it yet */
+ openpic_set_irq(opp, irq, 1);
+ openpic_set_irq(opp, irq, 0);
+ /* if all CPUs knew about it, set active bit again */
+ src->ivpr |= IVPR_ACTIVITY_MASK;
+ }
+ }
+
+ return retval;
+}
+
+void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
+{
+ struct openpic *opp = vcpu->arch.mpic;
+ int cpu = vcpu->arch.irq_cpu_id;
+ unsigned long flags;
+
+ spin_lock_irqsave(&opp->lock, flags);
+
+ if ((opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY)
+ kvmppc_set_epr(vcpu, openpic_iack(opp, &opp->dst[cpu], cpu));
+
+ spin_unlock_irqrestore(&opp->lock, flags);
+}
+
+static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
+ u32 *ptr, int idx)
+{
+ struct openpic *opp = opaque;
+ struct irq_dest *dst;
+ uint32_t retval;
+
+ pr_debug("%s: cpu %d addr %#llx\n", __func__, idx, addr);
+ retval = 0xFFFFFFFF;
+
+ if (idx < 0)
+ goto out;
+
+ if (addr & 0xF)
+ goto out;
+
+ dst = &opp->dst[idx];
+ addr &= 0xFF0;
+ switch (addr) {
+ case 0x80: /* CTPR */
+ retval = dst->ctpr;
+ break;
+ case 0x90: /* WHOAMI */
+ retval = idx;
+ break;
+ case 0xA0: /* IACK */
+ retval = openpic_iack(opp, dst, idx);
+ break;
+ case 0xB0: /* EOI */
+ retval = 0;
+ break;
+ default:
+ break;
+ }
+ pr_debug("%s: => 0x%08x\n", __func__, retval);
+
+out:
+ *ptr = retval;
+ return 0;
+}
+
+static int openpic_cpu_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+ struct openpic *opp = opaque;
+
+ return openpic_cpu_read_internal(opp, addr, ptr,
+ (addr & 0x1f000) >> 12);
+}
+
+struct mem_reg {
+ struct list_head list;
+ int (*read)(void *opaque, gpa_t addr, u32 *ptr);
+ int (*write)(void *opaque, gpa_t addr, u32 val);
+ gpa_t start_addr;
+ int size;
+};
+
+static struct mem_reg openpic_gbl_mmio = {
+ .write = openpic_gbl_write,
+ .read = openpic_gbl_read,
+ .start_addr = OPENPIC_GLB_REG_START,
+ .size = OPENPIC_GLB_REG_SIZE,
+};
+
+static struct mem_reg openpic_tmr_mmio = {
+ .write = openpic_tmr_write,
+ .read = openpic_tmr_read,
+ .start_addr = OPENPIC_TMR_REG_START,
+ .size = OPENPIC_TMR_REG_SIZE,
+};
+
+static struct mem_reg openpic_cpu_mmio = {
+ .write = openpic_cpu_write,
+ .read = openpic_cpu_read,
+ .start_addr = OPENPIC_CPU_REG_START,
+ .size = OPENPIC_CPU_REG_SIZE,
+};
+
+static struct mem_reg openpic_src_mmio = {
+ .write = openpic_src_write,
+ .read = openpic_src_read,
+ .start_addr = OPENPIC_SRC_REG_START,
+ .size = OPENPIC_SRC_REG_SIZE,
+};
+
+static struct mem_reg openpic_msi_mmio = {
+ .read = openpic_msi_read,
+ .write = openpic_msi_write,
+ .start_addr = OPENPIC_MSI_REG_START,
+ .size = OPENPIC_MSI_REG_SIZE,
+};
+
+static struct mem_reg openpic_summary_mmio = {
+ .read = openpic_summary_read,
+ .write = openpic_summary_write,
+ .start_addr = OPENPIC_SUMMARY_REG_START,
+ .size = OPENPIC_SUMMARY_REG_SIZE,
+};
+
+static void fsl_common_init(struct openpic *opp)
+{
+ int i;
+ int virq = MAX_SRC;
+
+ list_add(&openpic_msi_mmio.list, &opp->mmio_regions);
+ list_add(&openpic_summary_mmio.list, &opp->mmio_regions);
+
+ opp->vid = VID_REVISION_1_2;
+ opp->vir = VIR_GENERIC;
+ opp->vector_mask = 0xFFFF;
+ opp->tfrr_reset = 0;
+ opp->ivpr_reset = IVPR_MASK_MASK;
+ opp->idr_reset = 1 << 0;
+ opp->max_irq = MAX_IRQ;
+
+ opp->irq_ipi0 = virq;
+ virq += MAX_IPI;
+ opp->irq_tim0 = virq;
+ virq += MAX_TMR;
+
+ BUG_ON(virq > MAX_IRQ);
+
+ opp->irq_msi = 224;
+
+ for (i = 0; i < opp->fsl->max_ext; i++)
+ opp->src[i].level = false;
+
+ /* Internal interrupts, including message and MSI */
+ for (i = 16; i < MAX_SRC; i++) {
+ opp->src[i].type = IRQ_TYPE_FSLINT;
+ opp->src[i].level = true;
+ }
+
+ /* timers and IPIs */
+ for (i = MAX_SRC; i < virq; i++) {
+ opp->src[i].type = IRQ_TYPE_FSLSPECIAL;
+ opp->src[i].level = false;
+ }
+}
+
+static int kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr)
+{
+ struct list_head *node;
+
+ list_for_each(node, &opp->mmio_regions) {
+ struct mem_reg *mr = list_entry(node, struct mem_reg, list);
+
+ if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
+ continue;
+
+ return mr->read(opp, addr - mr->start_addr, ptr);
+ }
+
+ return -ENXIO;
+}
+
+static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val)
+{
+ struct list_head *node;
+
+ list_for_each(node, &opp->mmio_regions) {
+ struct mem_reg *mr = list_entry(node, struct mem_reg, list);
+
+ if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
+ continue;
+
+ return mr->write(opp, addr - mr->start_addr, val);
+ }
+
+ return -ENXIO;
+}
+
+static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr,
+ int len, void *ptr)
+{
+ struct openpic *opp = container_of(this, struct openpic, mmio);
+ int ret;
+ union {
+ u32 val;
+ u8 bytes[4];
+ } u;
+
+ if (addr & (len - 1)) {
+ pr_debug("%s: bad alignment %llx/%d\n",
+ __func__, addr, len);
+ return -EINVAL;
+ }
+
+ spin_lock_irq(&opp->lock);
+ ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val);
+ spin_unlock_irq(&opp->lock);
+
+ /*
+ * Technically only 32-bit accesses are allowed, but be nice to
+ * people dumping registers a byte at a time -- it works in real
+ * hardware (reads only, not writes).
+ */
+ if (len == 4) {
+ *(u32 *)ptr = u.val;
+ pr_debug("%s: addr %llx ret %d len 4 val %x\n",
+ __func__, addr, ret, u.val);
+ } else if (len == 1) {
+ *(u8 *)ptr = u.bytes[addr & 3];
+ pr_debug("%s: addr %llx ret %d len 1 val %x\n",
+ __func__, addr, ret, u.bytes[addr & 3]);
+ } else {
+ pr_debug("%s: bad length %d\n", __func__, len);
+ return -EINVAL;
+ }
+
+ return ret;
+}
+
+static int kvm_mpic_write(struct kvm_io_device *this, gpa_t addr,
+ int len, const void *ptr)
+{
+ struct openpic *opp = container_of(this, struct openpic, mmio);
+ int ret;
+
+ if (len != 4) {
+ pr_debug("%s: bad length %d\n", __func__, len);
+ return -EOPNOTSUPP;
+ }
+ if (addr & 3) {
+ pr_debug("%s: bad alignment %llx/%d\n", __func__, addr, len);
+ return -EOPNOTSUPP;
+ }
+
+ spin_lock_irq(&opp->lock);
+ ret = kvm_mpic_write_internal(opp, addr - opp->reg_base,
+ *(const u32 *)ptr);
+ spin_unlock_irq(&opp->lock);
+
+ pr_debug("%s: addr %llx ret %d val %x\n",
+ __func__, addr, ret, *(const u32 *)ptr);
+
+ return ret;
+}
+
+static const struct kvm_io_device_ops mpic_mmio_ops = {
+ .read = kvm_mpic_read,
+ .write = kvm_mpic_write,
+};
+
+static void map_mmio(struct openpic *opp)
+{
+ kvm_iodevice_init(&opp->mmio, &mpic_mmio_ops);
+
+ kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS,
+ opp->reg_base, OPENPIC_REG_SIZE,
+ &opp->mmio);
+}
+
+static void unmap_mmio(struct openpic *opp)
+{
+ kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio);
+}
+
+static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr)
+{
+ u64 base;
+
+ if (copy_from_user(&base, (u64 __user *)(long)attr->addr, sizeof(u64)))
+ return -EFAULT;
+
+ if (base & 0x3ffff) {
+ pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx not aligned\n",
+ __func__, base);
+ return -EINVAL;
+ }
+
+ if (base == opp->reg_base)
+ return 0;
+
+ mutex_lock(&opp->kvm->slots_lock);
+
+ unmap_mmio(opp);
+ opp->reg_base = base;
+
+ pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx\n",
+ __func__, base);
+
+ if (base == 0)
+ goto out;
+
+ map_mmio(opp);
+
+ mutex_unlock(&opp->kvm->slots_lock);
+out:
+ return 0;
+}
+
+#define ATTR_SET 0
+#define ATTR_GET 1
+
+static int access_reg(struct openpic *opp, gpa_t addr, u32 *val, int type)
+{
+ int ret;
+
+ if (addr & 3)
+ return -ENXIO;
+
+ spin_lock_irq(&opp->lock);
+
+ if (type == ATTR_SET)
+ ret = kvm_mpic_write_internal(opp, addr, *val);
+ else
+ ret = kvm_mpic_read_internal(opp, addr, val);
+
+ spin_unlock_irq(&opp->lock);
+
+ pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val);
+
+ return ret;
+}
+
+static int mpic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ struct openpic *opp = dev->private;
+ u32 attr32;
+
+ switch (attr->group) {
+ case KVM_DEV_MPIC_GRP_MISC:
+ switch (attr->attr) {
+ case KVM_DEV_MPIC_BASE_ADDR:
+ return set_base_addr(opp, attr);
+ }
+
+ break;
+
+ case KVM_DEV_MPIC_GRP_REGISTER:
+ if (get_user(attr32, (u32 __user *)(long)attr->addr))
+ return -EFAULT;
+
+ return access_reg(opp, attr->attr, &attr32, ATTR_SET);
+
+ case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+ if (attr->attr > MAX_SRC)
+ return -EINVAL;
+
+ if (get_user(attr32, (u32 __user *)(long)attr->addr))
+ return -EFAULT;
+
+ if (attr32 != 0 && attr32 != 1)
+ return -EINVAL;
+
+ spin_lock_irq(&opp->lock);
+ openpic_set_irq(opp, attr->attr, attr32);
+ spin_unlock_irq(&opp->lock);
+ return 0;
+ }
+
+ return -ENXIO;
+}
+
+static int mpic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ struct openpic *opp = dev->private;
+ u64 attr64;
+ u32 attr32;
+ int ret;
+
+ switch (attr->group) {
+ case KVM_DEV_MPIC_GRP_MISC:
+ switch (attr->attr) {
+ case KVM_DEV_MPIC_BASE_ADDR:
+ mutex_lock(&opp->kvm->slots_lock);
+ attr64 = opp->reg_base;
+ mutex_unlock(&opp->kvm->slots_lock);
+
+ if (copy_to_user((u64 __user *)(long)attr->addr,
+ &attr64, sizeof(u64)))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ break;
+
+ case KVM_DEV_MPIC_GRP_REGISTER:
+ ret = access_reg(opp, attr->attr, &attr32, ATTR_GET);
+ if (ret)
+ return ret;
+
+ if (put_user(attr32, (u32 __user *)(long)attr->addr))
+ return -EFAULT;
+
+ return 0;
+
+ case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+ if (attr->attr > MAX_SRC)
+ return -EINVAL;
+
+ spin_lock_irq(&opp->lock);
+ attr32 = opp->src[attr->attr].pending;
+ spin_unlock_irq(&opp->lock);
+
+ if (put_user(attr32, (u32 __user *)(long)attr->addr))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ return -ENXIO;
+}
+
+static int mpic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_MPIC_GRP_MISC:
+ switch (attr->attr) {
+ case KVM_DEV_MPIC_BASE_ADDR:
+ return 0;
+ }
+
+ break;
+
+ case KVM_DEV_MPIC_GRP_REGISTER:
+ return 0;
+
+ case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+ if (attr->attr > MAX_SRC)
+ break;
+
+ return 0;
+ }
+
+ return -ENXIO;
+}
+
+static void mpic_destroy(struct kvm_device *dev)
+{
+ struct openpic *opp = dev->private;
+
+ dev->kvm->arch.mpic = NULL;
+ kfree(opp);
+}
+
+static int mpic_set_default_irq_routing(struct openpic *opp)
+{
+ struct kvm_irq_routing_entry *routing;
+
+ /* Create a nop default map, so that dereferencing it still works */
+ routing = kzalloc((sizeof(*routing)), GFP_KERNEL);
+ if (!routing)
+ return -ENOMEM;
+
+ kvm_set_irq_routing(opp->kvm, routing, 0, 0);
+
+ kfree(routing);
+ return 0;
+}
+
+static int mpic_create(struct kvm_device *dev, u32 type)
+{
+ struct openpic *opp;
+ int ret;
+
+ /* We only support one MPIC at a time for now */
+ if (dev->kvm->arch.mpic)
+ return -EINVAL;
+
+ opp = kzalloc(sizeof(struct openpic), GFP_KERNEL);
+ if (!opp)
+ return -ENOMEM;
+
+ dev->private = opp;
+ opp->kvm = dev->kvm;
+ opp->dev = dev;
+ opp->model = type;
+ spin_lock_init(&opp->lock);
+
+ INIT_LIST_HEAD(&opp->mmio_regions);
+ list_add(&openpic_gbl_mmio.list, &opp->mmio_regions);
+ list_add(&openpic_tmr_mmio.list, &opp->mmio_regions);
+ list_add(&openpic_src_mmio.list, &opp->mmio_regions);
+ list_add(&openpic_cpu_mmio.list, &opp->mmio_regions);
+
+ switch (opp->model) {
+ case KVM_DEV_TYPE_FSL_MPIC_20:
+ opp->fsl = &fsl_mpic_20;
+ opp->brr1 = 0x00400200;
+ opp->flags |= OPENPIC_FLAG_IDR_CRIT;
+ opp->nb_irqs = 80;
+ opp->mpic_mode_mask = GCR_MODE_MIXED;
+
+ fsl_common_init(opp);
+
+ break;
+
+ case KVM_DEV_TYPE_FSL_MPIC_42:
+ opp->fsl = &fsl_mpic_42;
+ opp->brr1 = 0x00400402;
+ opp->flags |= OPENPIC_FLAG_ILR;
+ opp->nb_irqs = 196;
+ opp->mpic_mode_mask = GCR_MODE_PROXY;
+
+ fsl_common_init(opp);
+
+ break;
+
+ default:
+ ret = -ENODEV;
+ goto err;
+ }
+
+ ret = mpic_set_default_irq_routing(opp);
+ if (ret)
+ goto err;
+
+ openpic_reset(opp);
+
+ smp_wmb();
+ dev->kvm->arch.mpic = opp;
+
+ return 0;
+
+err:
+ kfree(opp);
+ return ret;
+}
+
+struct kvm_device_ops kvm_mpic_ops = {
+ .name = "kvm-mpic",
+ .create = mpic_create,
+ .destroy = mpic_destroy,
+ .set_attr = mpic_set_attr,
+ .get_attr = mpic_get_attr,
+ .has_attr = mpic_has_attr,
+};
+
+int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
+ u32 cpu)
+{
+ struct openpic *opp = dev->private;
+ int ret = 0;
+
+ if (dev->ops != &kvm_mpic_ops)
+ return -EPERM;
+ if (opp->kvm != vcpu->kvm)
+ return -EPERM;
+ if (cpu < 0 || cpu >= MAX_CPU)
+ return -EPERM;
+
+ spin_lock_irq(&opp->lock);
+
+ if (opp->dst[cpu].vcpu) {
+ ret = -EEXIST;
+ goto out;
+ }
+ if (vcpu->arch.irq_type) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ opp->dst[cpu].vcpu = vcpu;
+ opp->nb_cpus = max(opp->nb_cpus, cpu + 1);
+
+ vcpu->arch.mpic = opp;
+ vcpu->arch.irq_cpu_id = cpu;
+ vcpu->arch.irq_type = KVMPPC_IRQ_MPIC;
+
+ /* This might need to be changed if GCR gets extended */
+ if (opp->mpic_mode_mask == GCR_MODE_PROXY)
+ vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL;
+
+out:
+ spin_unlock_irq(&opp->lock);
+ return ret;
+}
+
+/*
+ * This should only happen immediately before the mpic is destroyed,
+ * so we shouldn't need to worry about anything still trying to
+ * access the vcpu pointer.
+ */
+void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu)
+{
+ BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu);
+
+ opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL;
+}
+
+/*
+ * Return value:
+ * < 0 Interrupt was ignored (masked or not delivered for other reasons)
+ * = 0 Interrupt was coalesced (previous irq is still pending)
+ * > 0 Number of CPUs interrupt was delivered to
+ */
+static int mpic_set_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ u32 irq = e->irqchip.pin;
+ struct openpic *opp = kvm->arch.mpic;
+ unsigned long flags;
+
+ spin_lock_irqsave(&opp->lock, flags);
+ openpic_set_irq(opp, irq, level);
+ spin_unlock_irqrestore(&opp->lock, flags);
+
+ /* All code paths we care about don't check for the return value */
+ return 0;
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+ struct openpic *opp = kvm->arch.mpic;
+ unsigned long flags;
+
+ spin_lock_irqsave(&opp->lock, flags);
+
+ /*
+ * XXX We ignore the target address for now, as we only support
+ * a single MSI bank.
+ */
+ openpic_msi_write(kvm->arch.mpic, MSIIR_OFFSET, e->msi.data);
+ spin_unlock_irqrestore(&opp->lock, flags);
+
+ /* All code paths we care about don't check for the return value */
+ return 0;
+}
+
+int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ int r = -EINVAL;
+
+ switch (ue->type) {
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ e->set = mpic_set_irq;
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ e->irqchip.pin = ue->u.irqchip.pin;
+ if (e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS)
+ goto out;
+ rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
+ break;
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+ break;
+ default:
+ goto out;
+ }
+
+ r = 0;
+out:
+ return r;
+}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index a822659db50a..31084c6335c9 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -25,6 +25,7 @@
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/slab.h>
+#include <linux/file.h>
#include <asm/cputable.h>
#include <asm/uaccess.h>
#include <asm/kvm_ppc.h>
@@ -32,6 +33,7 @@
#include <asm/cputhreads.h>
#include <asm/irqflags.h>
#include "timing.h"
+#include "irq.h"
#include "../mm/mmu_decl.h"
#define CREATE_TRACE_POINTS
@@ -317,6 +319,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_ENABLE_CAP:
case KVM_CAP_ONE_REG:
case KVM_CAP_IOEVENTFD:
+ case KVM_CAP_DEVICE_CTRL:
r = 1;
break;
#ifndef CONFIG_KVM_BOOK3S_64_HV
@@ -326,6 +329,9 @@ int kvm_dev_ioctl_check_extension(long ext)
#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
case KVM_CAP_SW_TLB:
#endif
+#ifdef CONFIG_KVM_MPIC
+ case KVM_CAP_IRQ_MPIC:
+#endif
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -335,6 +341,7 @@ int kvm_dev_ioctl_check_extension(long ext)
#ifdef CONFIG_PPC_BOOK3S_64
case KVM_CAP_SPAPR_TCE:
case KVM_CAP_PPC_ALLOC_HTAB:
+ case KVM_CAP_PPC_RTAS:
r = 1;
break;
#endif /* CONFIG_PPC_BOOK3S_64 */
@@ -459,6 +466,16 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
tasklet_kill(&vcpu->arch.tasklet);
kvmppc_remove_vcpu_debugfs(vcpu);
+
+ switch (vcpu->arch.irq_type) {
+ case KVMPPC_IRQ_MPIC:
+ kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu);
+ break;
+ case KVMPPC_IRQ_XICS:
+ kvmppc_xics_free_icp(vcpu);
+ break;
+ }
+
kvmppc_core_vcpu_free(vcpu);
}
@@ -531,12 +548,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
#endif
}
-int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
- struct kvm_guest_debug *dbg)
-{
- return -EINVAL;
-}
-
static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
struct kvm_run *run)
{
@@ -768,7 +779,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
break;
case KVM_CAP_PPC_EPR:
r = 0;
- vcpu->arch.epr_enabled = cap->args[0];
+ if (cap->args[0])
+ vcpu->arch.epr_flags |= KVMPPC_EPR_USER;
+ else
+ vcpu->arch.epr_flags &= ~KVMPPC_EPR_USER;
break;
#ifdef CONFIG_BOOKE
case KVM_CAP_PPC_BOOKE_WATCHDOG:
@@ -789,6 +803,25 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
break;
}
#endif
+#ifdef CONFIG_KVM_MPIC
+ case KVM_CAP_IRQ_MPIC: {
+ struct file *filp;
+ struct kvm_device *dev;
+
+ r = -EBADF;
+ filp = fget(cap->args[0]);
+ if (!filp)
+ break;
+
+ r = -EPERM;
+ dev = kvm_device_from_filp(filp);
+ if (dev)
+ r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]);
+
+ fput(filp);
+ break;
+ }
+#endif
default:
r = -EINVAL;
break;
@@ -911,9 +944,22 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
return 0;
}
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+ bool line_status)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level,
+ line_status);
+ return 0;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
+ struct kvm *kvm __maybe_unused = filp->private_data;
void __user *argp = (void __user *)arg;
long r;
@@ -932,7 +978,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
#ifdef CONFIG_PPC_BOOK3S_64
case KVM_CREATE_SPAPR_TCE: {
struct kvm_create_spapr_tce create_tce;
- struct kvm *kvm = filp->private_data;
r = -EFAULT;
if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
@@ -944,8 +989,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
#ifdef CONFIG_KVM_BOOK3S_64_HV
case KVM_ALLOCATE_RMA: {
- struct kvm *kvm = filp->private_data;
struct kvm_allocate_rma rma;
+ struct kvm *kvm = filp->private_data;
r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma)))
@@ -954,7 +999,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
case KVM_PPC_ALLOCATE_HTAB: {
- struct kvm *kvm = filp->private_data;
u32 htab_order;
r = -EFAULT;
@@ -971,7 +1015,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
case KVM_PPC_GET_HTAB_FD: {
- struct kvm *kvm = filp->private_data;
struct kvm_get_htab_fd ghf;
r = -EFAULT;
@@ -984,7 +1027,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
#ifdef CONFIG_PPC_BOOK3S_64
case KVM_PPC_GET_SMMU_INFO: {
- struct kvm *kvm = filp->private_data;
struct kvm_ppc_smmu_info info;
memset(&info, 0, sizeof(info));
@@ -993,6 +1035,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = -EFAULT;
break;
}
+ case KVM_PPC_RTAS_DEFINE_TOKEN: {
+ struct kvm *kvm = filp->private_data;
+
+ r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
+ break;
+ }
#endif /* CONFIG_PPC_BOOK3S_64 */
default:
r = -ENOTTY;
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 48861d3fcd07..20b328bb494d 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -51,6 +51,12 @@ static struct icp_ipl __iomem *icp_native_regs[NR_CPUS];
static inline unsigned int icp_native_get_xirr(void)
{
int cpu = smp_processor_id();
+ unsigned int xirr;
+
+ /* Handled an interrupt latched by KVM */
+ xirr = kvmppc_get_xics_latch();
+ if (xirr)
+ return xirr;
return in_be32(&icp_native_regs[cpu]->xirr.word);
}
@@ -138,6 +144,7 @@ static unsigned int icp_native_get_irq(void)
static void icp_native_cause_ipi(int cpu, unsigned long data)
{
+ kvmppc_set_host_ipi(cpu, 1);
icp_native_set_qirr(cpu, IPI_PRIORITY);
}
@@ -151,6 +158,7 @@ static irqreturn_t icp_native_ipi_action(int irq, void *dev_id)
{
int cpu = smp_processor_id();
+ kvmppc_set_host_ipi(cpu, 0);
icp_native_set_qirr(cpu, 0xff);
return smp_ipi_demux();
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 111b4a0c3907..38f2cc0a5a23 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -43,6 +43,8 @@
#define KVM_PIO_PAGE_OFFSET 1
#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
+
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
| X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 586f00059805..9d50efdb719d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
select MMU_NOTIFIER
select ANON_INODES
select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
select KVM_ASYNC_PF
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 04d30401c5cb..a797b8e43ade 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
coalesced_mmio.o irq_comm.o eventfd.o \
- assigned-dev.o)
+ assigned-dev.o irqchip.o)
kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c522260b5bbf..145b1c81011b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2522,7 +2522,6 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PCI_2_3:
case KVM_CAP_KVMCLOCK_CTRL:
case KVM_CAP_READONLY_MEM:
- case KVM_CAP_IRQFD_RESAMPLE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7bde42470e37..309774715897 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -303,10 +303,10 @@ struct kvm_kernel_irq_routing_entry {
struct hlist_node link;
};
-#ifdef __KVM_HAVE_IOAPIC
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
struct kvm_irq_routing_table {
- int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS];
+ int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
struct kvm_kernel_irq_routing_entry *rt_entries;
u32 nr_rt_entries;
/*
@@ -392,6 +392,7 @@ struct kvm {
long mmu_notifier_count;
#endif
long tlbs_dirty;
+ struct list_head devices;
};
#define kvm_err(fmt, ...) \
@@ -431,7 +432,7 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
int __must_check vcpu_load(struct kvm_vcpu *vcpu);
void vcpu_put(struct kvm_vcpu *vcpu);
-#ifdef __KVM_HAVE_IOAPIC
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
int kvm_irqfd_init(void);
void kvm_irqfd_exit(void);
#else
@@ -718,11 +719,6 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
bool mask);
-#ifdef __KVM_HAVE_IOAPIC
-void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
- union kvm_ioapic_redirect_entry *entry,
- unsigned long *deliver_bitmask);
-#endif
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
bool line_status);
int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level);
@@ -956,7 +952,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
}
#endif
-#ifdef KVM_CAP_IRQ_ROUTING
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
#define KVM_MAX_IRQ_ROUTES 1024
@@ -965,6 +961,9 @@ int kvm_set_irq_routing(struct kvm *kvm,
const struct kvm_irq_routing_entry *entries,
unsigned nr,
unsigned flags);
+int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue);
void kvm_free_irq_routing(struct kvm *kvm);
int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
@@ -1065,6 +1064,43 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
extern bool kvm_rebooting;
+struct kvm_device_ops;
+
+struct kvm_device {
+ struct kvm_device_ops *ops;
+ struct kvm *kvm;
+ void *private;
+ struct list_head vm_node;
+};
+
+/* create, destroy, and name are mandatory */
+struct kvm_device_ops {
+ const char *name;
+ int (*create)(struct kvm_device *dev, u32 type);
+
+ /*
+ * Destroy is responsible for freeing dev.
+ *
+ * Destroy may be called before or after destructors are called
+ * on emulated I/O regions, depending on whether a reference is
+ * held by a vcpu or other kvm component that gets destroyed
+ * after the emulated I/O.
+ */
+ void (*destroy)(struct kvm_device *dev);
+
+ int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
+ int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
+ int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
+ long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
+ unsigned long arg);
+};
+
+void kvm_device_get(struct kvm_device *dev);
+void kvm_device_put(struct kvm_device *dev);
+struct kvm_device *kvm_device_from_filp(struct file *filp);
+
+extern struct kvm_device_ops kvm_mpic_ops;
+
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 19911dddaeb7..7005d1109ec9 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -37,7 +37,7 @@ TRACE_EVENT(kvm_userspace_exit,
__entry->errno < 0 ? -__entry->errno : __entry->reason)
);
-#if defined(__KVM_HAVE_IRQ_LINE)
+#if defined(CONFIG_HAVE_KVM_IRQCHIP)
TRACE_EVENT(kvm_set_irq,
TP_PROTO(unsigned int gsi, int level, int irq_source_id),
TP_ARGS(gsi, level, irq_source_id),
@@ -122,6 +122,10 @@ TRACE_EVENT(kvm_msi_set_irq,
{KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \
{KVM_IRQCHIP_IOAPIC, "IOAPIC"}
+#endif /* defined(__KVM_HAVE_IOAPIC) */
+
+#if defined(CONFIG_HAVE_KVM_IRQCHIP)
+
TRACE_EVENT(kvm_ack_irq,
TP_PROTO(unsigned int irqchip, unsigned int pin),
TP_ARGS(irqchip, pin),
@@ -136,14 +140,18 @@ TRACE_EVENT(kvm_ack_irq,
__entry->pin = pin;
),
+#ifdef kvm_irqchips
TP_printk("irqchip %s pin %u",
__print_symbolic(__entry->irqchip, kvm_irqchips),
__entry->pin)
+#else
+ TP_printk("irqchip %d pin %u", __entry->irqchip, __entry->pin)
+#endif
);
+#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */
-#endif /* defined(__KVM_HAVE_IOAPIC) */
#define KVM_TRACE_MMIO_READ_UNSATISFIED 0
#define KVM_TRACE_MMIO_READ 1
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 74d0ff3dfd66..d4005192ad6e 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -579,9 +579,7 @@ struct kvm_ppc_smmu_info {
#ifdef __KVM_HAVE_PIT
#define KVM_CAP_REINJECT_CONTROL 24
#endif
-#ifdef __KVM_HAVE_IOAPIC
#define KVM_CAP_IRQ_ROUTING 25
-#endif
#define KVM_CAP_IRQ_INJECT_STATUS 26
#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
#define KVM_CAP_DEVICE_DEASSIGNMENT 27
@@ -668,6 +666,9 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_PPC_EPR 86
#define KVM_CAP_ARM_PSCI 87
#define KVM_CAP_ARM_SET_DEVICE_ADDR 88
+#define KVM_CAP_DEVICE_CTRL 89
+#define KVM_CAP_IRQ_MPIC 90
+#define KVM_CAP_PPC_RTAS 91
#ifdef KVM_CAP_IRQ_ROUTING
@@ -821,6 +822,27 @@ struct kvm_arm_device_addr {
};
/*
+ * Device control API, available with KVM_CAP_DEVICE_CTRL
+ */
+#define KVM_CREATE_DEVICE_TEST 1
+
+struct kvm_create_device {
+ __u32 type; /* in: KVM_DEV_TYPE_xxx */
+ __u32 fd; /* out: device handle */
+ __u32 flags; /* in: KVM_CREATE_DEVICE_xxx */
+};
+
+struct kvm_device_attr {
+ __u32 flags; /* no flags currently defined */
+ __u32 group; /* device-defined */
+ __u64 attr; /* group-defined */
+ __u64 addr; /* userspace address of attr data */
+};
+
+#define KVM_DEV_TYPE_FSL_MPIC_20 1
+#define KVM_DEV_TYPE_FSL_MPIC_42 2
+
+/*
* ioctls for VM fds
*/
#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region)
@@ -907,6 +929,16 @@ struct kvm_s390_ucas_mapping {
#define KVM_PPC_GET_HTAB_FD _IOW(KVMIO, 0xaa, struct kvm_get_htab_fd)
/* Available with KVM_CAP_ARM_SET_DEVICE_ADDR */
#define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr)
+/* Available with KVM_CAP_PPC_RTAS */
+#define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args)
+
+/* ioctl for vm fd */
+#define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device)
+
+/* ioctls for fds returned by KVM_CREATE_DEVICE */
+#define KVM_SET_DEVICE_ATTR _IOW(KVMIO, 0xe1, struct kvm_device_attr)
+#define KVM_GET_DEVICE_ATTR _IOW(KVMIO, 0xe2, struct kvm_device_attr)
+#define KVM_HAS_DEVICE_ATTR _IOW(KVMIO, 0xe3, struct kvm_device_attr)
/*
* ioctls for vcpu fds
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index d01b24b72c61..779262f59e25 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -6,6 +6,9 @@ config HAVE_KVM
config HAVE_KVM_IRQCHIP
bool
+config HAVE_KVM_IRQ_ROUTING
+ bool
+
config HAVE_KVM_EVENTFD
bool
select EVENTFD
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index f4c7f591b5d8..8db43701016f 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -983,36 +983,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
goto out;
break;
}
-#ifdef KVM_CAP_IRQ_ROUTING
- case KVM_SET_GSI_ROUTING: {
- struct kvm_irq_routing routing;
- struct kvm_irq_routing __user *urouting;
- struct kvm_irq_routing_entry *entries;
-
- r = -EFAULT;
- if (copy_from_user(&routing, argp, sizeof(routing)))
- goto out;
- r = -EINVAL;
- if (routing.nr >= KVM_MAX_IRQ_ROUTES)
- goto out;
- if (routing.flags)
- goto out;
- r = -ENOMEM;
- entries = vmalloc(routing.nr * sizeof(*entries));
- if (!entries)
- goto out;
- r = -EFAULT;
- urouting = argp;
- if (copy_from_user(entries, urouting->entries,
- routing.nr * sizeof(*entries)))
- goto out_free_irq_routing;
- r = kvm_set_irq_routing(kvm, entries, routing.nr,
- routing.flags);
- out_free_irq_routing:
- vfree(entries);
- break;
- }
-#endif /* KVM_CAP_IRQ_ROUTING */
#ifdef __KVM_HAVE_MSIX
case KVM_ASSIGN_SET_MSIX_NR: {
struct kvm_assigned_msix_nr entry_nr;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index c5d43ffbf1f3..64ee720b75c7 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -35,7 +35,7 @@
#include "iodev.h"
-#ifdef __KVM_HAVE_IOAPIC
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
/*
* --------------------------------------------------------------------
* irqfd: Allows an fd to be used to inject an interrupt to the guest
@@ -433,7 +433,7 @@ fail:
void
kvm_eventfd_init(struct kvm *kvm)
{
-#ifdef __KVM_HAVE_IOAPIC
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
spin_lock_init(&kvm->irqfds.lock);
INIT_LIST_HEAD(&kvm->irqfds.items);
INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
@@ -442,7 +442,7 @@ kvm_eventfd_init(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->ioeventfds);
}
-#ifdef __KVM_HAVE_IOAPIC
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
/*
* shutdown any irqfd's that match fd+gsi
*/
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 25ab48007adb..e2e6b4473a96 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -151,59 +151,6 @@ static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
return -EWOULDBLOCK;
}
-int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
-{
- struct kvm_kernel_irq_routing_entry route;
-
- if (!irqchip_in_kernel(kvm) || msi->flags != 0)
- return -EINVAL;
-
- route.msi.address_lo = msi->address_lo;
- route.msi.address_hi = msi->address_hi;
- route.msi.data = msi->data;
-
- return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false);
-}
-
-/*
- * Return value:
- * < 0 Interrupt was ignored (masked or not delivered for other reasons)
- * = 0 Interrupt was coalesced (previous irq is still pending)
- * > 0 Number of CPUs interrupt was delivered to
- */
-int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
- bool line_status)
-{
- struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
- int ret = -1, i = 0;
- struct kvm_irq_routing_table *irq_rt;
-
- trace_kvm_set_irq(irq, level, irq_source_id);
-
- /* Not possible to detect if the guest uses the PIC or the
- * IOAPIC. So set the bit in both. The guest will ignore
- * writes to the unused one.
- */
- rcu_read_lock();
- irq_rt = rcu_dereference(kvm->irq_routing);
- if (irq < irq_rt->nr_rt_entries)
- hlist_for_each_entry(e, &irq_rt->map[irq], link)
- irq_set[i++] = *e;
- rcu_read_unlock();
-
- while(i--) {
- int r;
- r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level,
- line_status);
- if (r < 0)
- continue;
-
- ret = r + ((ret < 0) ? 0 : ret);
- }
-
- return ret;
-}
-
/*
* Deliver an IRQ in an atomic context if we can, or return a failure,
* user can retry in a process context.
@@ -241,63 +188,6 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
return ret;
}
-bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
-{
- struct kvm_irq_ack_notifier *kian;
- int gsi;
-
- rcu_read_lock();
- gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
- if (gsi != -1)
- hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
- link)
- if (kian->gsi == gsi) {
- rcu_read_unlock();
- return true;
- }
-
- rcu_read_unlock();
-
- return false;
-}
-EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
-
-void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
-{
- struct kvm_irq_ack_notifier *kian;
- int gsi;
-
- trace_kvm_ack_irq(irqchip, pin);
-
- rcu_read_lock();
- gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
- if (gsi != -1)
- hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
- link)
- if (kian->gsi == gsi)
- kian->irq_acked(kian);
- rcu_read_unlock();
-}
-
-void kvm_register_irq_ack_notifier(struct kvm *kvm,
- struct kvm_irq_ack_notifier *kian)
-{
- mutex_lock(&kvm->irq_lock);
- hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
- mutex_unlock(&kvm->irq_lock);
- kvm_vcpu_request_scan_ioapic(kvm);
-}
-
-void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
- struct kvm_irq_ack_notifier *kian)
-{
- mutex_lock(&kvm->irq_lock);
- hlist_del_init_rcu(&kian->link);
- mutex_unlock(&kvm->irq_lock);
- synchronize_rcu();
- kvm_vcpu_request_scan_ioapic(kvm);
-}
-
int kvm_request_irq_source_id(struct kvm *kvm)
{
unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
@@ -381,34 +271,14 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
rcu_read_unlock();
}
-void kvm_free_irq_routing(struct kvm *kvm)
-{
- /* Called only during vm destruction. Nobody can use the pointer
- at this stage */
- kfree(kvm->irq_routing);
-}
-
-static int setup_routing_entry(struct kvm_irq_routing_table *rt,
- struct kvm_kernel_irq_routing_entry *e,
- const struct kvm_irq_routing_entry *ue)
+int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
{
int r = -EINVAL;
int delta;
unsigned max_pin;
- struct kvm_kernel_irq_routing_entry *ei;
- /*
- * Do not allow GSI to be mapped to the same irqchip more than once.
- * Allow only one to one mapping between GSI and MSI.
- */
- hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
- if (ei->type == KVM_IRQ_ROUTING_MSI ||
- ue->type == KVM_IRQ_ROUTING_MSI ||
- ue->u.irqchip.irqchip == ei->irqchip.irqchip)
- return r;
-
- e->gsi = ue->gsi;
- e->type = ue->type;
switch (ue->type) {
case KVM_IRQ_ROUTING_IRQCHIP:
delta = 0;
@@ -445,69 +315,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
goto out;
}
- hlist_add_head(&e->link, &rt->map[e->gsi]);
r = 0;
out:
return r;
}
-
-int kvm_set_irq_routing(struct kvm *kvm,
- const struct kvm_irq_routing_entry *ue,
- unsigned nr,
- unsigned flags)
-{
- struct kvm_irq_routing_table *new, *old;
- u32 i, j, nr_rt_entries = 0;
- int r;
-
- for (i = 0; i < nr; ++i) {
- if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES)
- return -EINVAL;
- nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
- }
-
- nr_rt_entries += 1;
-
- new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head))
- + (nr * sizeof(struct kvm_kernel_irq_routing_entry)),
- GFP_KERNEL);
-
- if (!new)
- return -ENOMEM;
-
- new->rt_entries = (void *)&new->map[nr_rt_entries];
-
- new->nr_rt_entries = nr_rt_entries;
- for (i = 0; i < 3; i++)
- for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++)
- new->chip[i][j] = -1;
-
- for (i = 0; i < nr; ++i) {
- r = -EINVAL;
- if (ue->flags)
- goto out;
- r = setup_routing_entry(new, &new->rt_entries[i], ue);
- if (r)
- goto out;
- ++ue;
- }
-
- mutex_lock(&kvm->irq_lock);
- old = kvm->irq_routing;
- kvm_irq_routing_update(kvm, new);
- mutex_unlock(&kvm->irq_lock);
-
- synchronize_rcu();
-
- new = old;
- r = 0;
-
-out:
- kfree(new);
- return r;
-}
-
#define IOAPIC_ROUTING_ENTRY(irq) \
{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
.u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC, .u.irqchip.pin = (irq) }
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
new file mode 100644
index 000000000000..20dc9e4a8f6c
--- /dev/null
+++ b/virt/kvm/irqchip.c
@@ -0,0 +1,237 @@
+/*
+ * irqchip.c: Common API for in kernel interrupt controllers
+ * Copyright (c) 2007, Intel Corporation.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (c) 2013, Alexander Graf <agraf@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * This file is derived from virt/kvm/irq_comm.c.
+ *
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ * Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <trace/events/kvm.h>
+#include "irq.h"
+
+bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
+{
+ struct kvm_irq_ack_notifier *kian;
+ int gsi;
+
+ rcu_read_lock();
+ gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
+ link)
+ if (kian->gsi == gsi) {
+ rcu_read_unlock();
+ return true;
+ }
+
+ rcu_read_unlock();
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
+
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
+{
+ struct kvm_irq_ack_notifier *kian;
+ int gsi;
+
+ trace_kvm_ack_irq(irqchip, pin);
+
+ rcu_read_lock();
+ gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
+ link)
+ if (kian->gsi == gsi)
+ kian->irq_acked(kian);
+ rcu_read_unlock();
+}
+
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+ struct kvm_irq_ack_notifier *kian)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+#ifdef __KVM_HAVE_IOAPIC
+ kvm_vcpu_request_scan_ioapic(kvm);
+#endif
+}
+
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+ struct kvm_irq_ack_notifier *kian)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_init_rcu(&kian->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_rcu();
+#ifdef __KVM_HAVE_IOAPIC
+ kvm_vcpu_request_scan_ioapic(kvm);
+#endif
+}
+
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+ struct kvm_kernel_irq_routing_entry route;
+
+ if (!irqchip_in_kernel(kvm) || msi->flags != 0)
+ return -EINVAL;
+
+ route.msi.address_lo = msi->address_lo;
+ route.msi.address_hi = msi->address_hi;
+ route.msi.data = msi->data;
+
+ return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false);
+}
+
+/*
+ * Return value:
+ * < 0 Interrupt was ignored (masked or not delivered for other reasons)
+ * = 0 Interrupt was coalesced (previous irq is still pending)
+ * > 0 Number of CPUs interrupt was delivered to
+ */
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
+ bool line_status)
+{
+ struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
+ int ret = -1, i = 0;
+ struct kvm_irq_routing_table *irq_rt;
+
+ trace_kvm_set_irq(irq, level, irq_source_id);
+
+ /* Not possible to detect if the guest uses the PIC or the
+ * IOAPIC. So set the bit in both. The guest will ignore
+ * writes to the unused one.
+ */
+ rcu_read_lock();
+ irq_rt = rcu_dereference(kvm->irq_routing);
+ if (irq < irq_rt->nr_rt_entries)
+ hlist_for_each_entry(e, &irq_rt->map[irq], link)
+ irq_set[i++] = *e;
+ rcu_read_unlock();
+
+ while(i--) {
+ int r;
+ r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level,
+ line_status);
+ if (r < 0)
+ continue;
+
+ ret = r + ((ret < 0) ? 0 : ret);
+ }
+
+ return ret;
+}
+
+void kvm_free_irq_routing(struct kvm *kvm)
+{
+ /* Called only during vm destruction. Nobody can use the pointer
+ at this stage */
+ kfree(kvm->irq_routing);
+}
+
+static int setup_routing_entry(struct kvm_irq_routing_table *rt,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ int r = -EINVAL;
+ struct kvm_kernel_irq_routing_entry *ei;
+
+ /*
+ * Do not allow GSI to be mapped to the same irqchip more than once.
+ * Allow only one to one mapping between GSI and MSI.
+ */
+ hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
+ if (ei->type == KVM_IRQ_ROUTING_MSI ||
+ ue->type == KVM_IRQ_ROUTING_MSI ||
+ ue->u.irqchip.irqchip == ei->irqchip.irqchip)
+ return r;
+
+ e->gsi = ue->gsi;
+ e->type = ue->type;
+ r = kvm_set_routing_entry(rt, e, ue);
+ if (r)
+ goto out;
+
+ hlist_add_head(&e->link, &rt->map[e->gsi]);
+ r = 0;
+out:
+ return r;
+}
+
+int kvm_set_irq_routing(struct kvm *kvm,
+ const struct kvm_irq_routing_entry *ue,
+ unsigned nr,
+ unsigned flags)
+{
+ struct kvm_irq_routing_table *new, *old;
+ u32 i, j, nr_rt_entries = 0;
+ int r;
+
+ for (i = 0; i < nr; ++i) {
+ if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES)
+ return -EINVAL;
+ nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
+ }
+
+ nr_rt_entries += 1;
+
+ new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head))
+ + (nr * sizeof(struct kvm_kernel_irq_routing_entry)),
+ GFP_KERNEL);
+
+ if (!new)
+ return -ENOMEM;
+
+ new->rt_entries = (void *)&new->map[nr_rt_entries];
+
+ new->nr_rt_entries = nr_rt_entries;
+ for (i = 0; i < KVM_NR_IRQCHIPS; i++)
+ for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++)
+ new->chip[i][j] = -1;
+
+ for (i = 0; i < nr; ++i) {
+ r = -EINVAL;
+ if (ue->flags)
+ goto out;
+ r = setup_routing_entry(new, &new->rt_entries[i], ue);
+ if (r)
+ goto out;
+ ++ue;
+ }
+
+ mutex_lock(&kvm->irq_lock);
+ old = kvm->irq_routing;
+ kvm_irq_routing_update(kvm, new);
+ mutex_unlock(&kvm->irq_lock);
+
+ synchronize_rcu();
+
+ new = old;
+ r = 0;
+
+out:
+ kfree(new);
+ return r;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index aaac1a7a9ea8..5da9f02a2a67 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -504,6 +504,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
mutex_init(&kvm->irq_lock);
mutex_init(&kvm->slots_lock);
atomic_set(&kvm->users_count, 1);
+ INIT_LIST_HEAD(&kvm->devices);
r = kvm_init_mmu_notifier(kvm);
if (r)
@@ -581,6 +582,19 @@ void kvm_free_physmem(struct kvm *kvm)
kfree(kvm->memslots);
}
+static void kvm_destroy_devices(struct kvm *kvm)
+{
+ struct list_head *node, *tmp;
+
+ list_for_each_safe(node, tmp, &kvm->devices) {
+ struct kvm_device *dev =
+ list_entry(node, struct kvm_device, vm_node);
+
+ list_del(node);
+ dev->ops->destroy(dev);
+ }
+}
+
static void kvm_destroy_vm(struct kvm *kvm)
{
int i;
@@ -600,6 +614,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm_arch_flush_shadow_all(kvm);
#endif
kvm_arch_destroy_vm(kvm);
+ kvm_destroy_devices(kvm);
kvm_free_physmem(kvm);
cleanup_srcu_struct(&kvm->srcu);
kvm_arch_free_vm(kvm);
@@ -2159,6 +2174,111 @@ out:
}
#endif
+static int kvm_device_ioctl_attr(struct kvm_device *dev,
+ int (*accessor)(struct kvm_device *dev,
+ struct kvm_device_attr *attr),
+ unsigned long arg)
+{
+ struct kvm_device_attr attr;
+
+ if (!accessor)
+ return -EPERM;
+
+ if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+ return -EFAULT;
+
+ return accessor(dev, &attr);
+}
+
+static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct kvm_device *dev = filp->private_data;
+
+ switch (ioctl) {
+ case KVM_SET_DEVICE_ATTR:
+ return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
+ case KVM_GET_DEVICE_ATTR:
+ return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
+ case KVM_HAS_DEVICE_ATTR:
+ return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
+ default:
+ if (dev->ops->ioctl)
+ return dev->ops->ioctl(dev, ioctl, arg);
+
+ return -ENOTTY;
+ }
+}
+
+static int kvm_device_release(struct inode *inode, struct file *filp)
+{
+ struct kvm_device *dev = filp->private_data;
+ struct kvm *kvm = dev->kvm;
+
+ kvm_put_kvm(kvm);
+ return 0;
+}
+
+static const struct file_operations kvm_device_fops = {
+ .unlocked_ioctl = kvm_device_ioctl,
+ .release = kvm_device_release,
+};
+
+struct kvm_device *kvm_device_from_filp(struct file *filp)
+{
+ if (filp->f_op != &kvm_device_fops)
+ return NULL;
+
+ return filp->private_data;
+}
+
+static int kvm_ioctl_create_device(struct kvm *kvm,
+ struct kvm_create_device *cd)
+{
+ struct kvm_device_ops *ops = NULL;
+ struct kvm_device *dev;
+ bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
+ int ret;
+
+ switch (cd->type) {
+#ifdef CONFIG_KVM_MPIC
+ case KVM_DEV_TYPE_FSL_MPIC_20:
+ case KVM_DEV_TYPE_FSL_MPIC_42:
+ ops = &kvm_mpic_ops;
+ break;
+#endif
+ default:
+ return -ENODEV;
+ }
+
+ if (test)
+ return 0;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+
+ dev->ops = ops;
+ dev->kvm = kvm;
+
+ ret = ops->create(dev, cd->type);
+ if (ret < 0) {
+ kfree(dev);
+ return ret;
+ }
+
+ ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR);
+ if (ret < 0) {
+ ops->destroy(dev);
+ return ret;
+ }
+
+ list_add(&dev->vm_node, &kvm->devices);
+ kvm_get_kvm(kvm);
+ cd->fd = ret;
+ return 0;
+}
+
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -2274,6 +2394,54 @@ static long kvm_vm_ioctl(struct file *filp,
break;
}
#endif
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
+ case KVM_SET_GSI_ROUTING: {
+ struct kvm_irq_routing routing;
+ struct kvm_irq_routing __user *urouting;
+ struct kvm_irq_routing_entry *entries;
+
+ r = -EFAULT;
+ if (copy_from_user(&routing, argp, sizeof(routing)))
+ goto out;
+ r = -EINVAL;
+ if (routing.nr >= KVM_MAX_IRQ_ROUTES)
+ goto out;
+ if (routing.flags)
+ goto out;
+ r = -ENOMEM;
+ entries = vmalloc(routing.nr * sizeof(*entries));
+ if (!entries)
+ goto out;
+ r = -EFAULT;
+ urouting = argp;
+ if (copy_from_user(entries, urouting->entries,
+ routing.nr * sizeof(*entries)))
+ goto out_free_irq_routing;
+ r = kvm_set_irq_routing(kvm, entries, routing.nr,
+ routing.flags);
+ out_free_irq_routing:
+ vfree(entries);
+ break;
+ }
+#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
+ case KVM_CREATE_DEVICE: {
+ struct kvm_create_device cd;
+
+ r = -EFAULT;
+ if (copy_from_user(&cd, argp, sizeof(cd)))
+ goto out;
+
+ r = kvm_ioctl_create_device(kvm, &cd);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &cd, sizeof(cd)))
+ goto out;
+
+ r = 0;
+ break;
+ }
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
if (r == -ENOTTY)
@@ -2403,8 +2571,11 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
#ifdef CONFIG_HAVE_KVM_MSI
case KVM_CAP_SIGNAL_MSI:
#endif
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
+ case KVM_CAP_IRQFD_RESAMPLE:
+#endif
return 1;
-#ifdef KVM_CAP_IRQ_ROUTING
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
case KVM_CAP_IRQ_ROUTING:
return KVM_MAX_IRQ_ROUTES;
#endif