Diffstat (limited to 'virt')
 -rw-r--r--  virt/Makefile                     |    1
 -rw-r--r--  virt/kvm/Kconfig                  |    3
 -rw-r--r--  virt/kvm/arm/arch_timer.c         |   17
 -rw-r--r--  virt/kvm/arm/arm.c                |   54
 -rw-r--r--  virt/kvm/arm/hyp/vgic-v3-sr.c     |    4
 -rw-r--r--  virt/kvm/arm/mmu.c                |  133
 -rw-r--r--  virt/kvm/arm/vgic/vgic-its.c      |   31
 -rw-r--r--  virt/kvm/arm/vgic/vgic-mmio-v3.c  |    3
 -rw-r--r--  virt/kvm/arm/vgic/vgic-v3.c       |    4
 -rw-r--r--  virt/kvm/arm/vgic/vgic.c          |   35
 -rw-r--r--  virt/kvm/arm/vgic/vgic.h          |    1
 -rw-r--r--  virt/kvm/eventfd.c                |    6
 -rw-r--r--  virt/kvm/irqchip.c                |    5
 -rw-r--r--  virt/kvm/kvm_main.c               |  122
 -rw-r--r--  virt/lib/Kconfig                  |    1
 -rw-r--r--  virt/lib/Makefile                 |    1
 16 files changed, 308 insertions, 113 deletions
diff --git a/virt/Makefile b/virt/Makefile
index be783472ac81..1cfea9436af9 100644
--- a/virt/Makefile
+++ b/virt/Makefile
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-y += lib/
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index ea434ddc8499..aad9284c043a 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -57,3 +57,6 @@ config HAVE_KVM_VCPU_ASYNC_IOCTL
config HAVE_KVM_VCPU_RUN_PID_CHANGE
bool
+
+config HAVE_KVM_NO_POLL
+ bool
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 3417f2dbc366..7fc272ecae16 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -508,6 +508,14 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
/*
+ * Update the timer output so that it is likely to match the
+ * state we're about to restore. If the timer expires between
+ * this point and the register restoration, we'll take the
+ * interrupt anyway.
+ */
+ kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer);
+
+ /*
* When using a userspace irqchip with the architected timers and a
* host interrupt controller that doesn't support an active state, we
* must still prevent continuously exiting from the guest, and
@@ -730,7 +738,6 @@ static void kvm_timer_init_interrupt(void *info)
int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
{
struct arch_timer_context *timer;
- bool level;
switch (regid) {
case KVM_REG_ARM_TIMER_CTL:
@@ -758,10 +765,6 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
return -1;
}
- level = kvm_timer_should_fire(timer);
- kvm_timer_update_irq(vcpu, level, timer);
- timer_emulate(timer);
-
return 0;
}
@@ -812,7 +815,7 @@ static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
switch (treg) {
case TIMER_REG_TVAL:
- val = kvm_phys_timer_read() - timer->cntvoff - timer->cnt_cval;
+ val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff;
break;
case TIMER_REG_CTL:
@@ -858,7 +861,7 @@ static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
{
switch (treg) {
case TIMER_REG_TVAL:
- timer->cnt_cval = val - kvm_phys_timer_read() - timer->cntvoff;
+ timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + val;
break;
case TIMER_REG_CTL:
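The two TIMER_REG_TVAL hunks above fix inverted arithmetic: architecturally, reading TVAL yields the compare value minus the current offset-adjusted count, and writing TVAL sets the compare value to the current offset-adjusted count plus the written value; the previous expressions had the sign of the counter term wrong in both directions. A minimal standalone sketch of the corrected relationship (helper names here are illustrative, not the kernel's):

    #include <stdint.h>

    /* The guest-visible count is the physical counter minus CNTVOFF. */
    static uint64_t guest_count(uint64_t phys_count, uint64_t cntvoff)
    {
        return phys_count - cntvoff;
    }

    /* Reading TVAL: signed distance from "now" to the compare value,
     * i.e. cnt_cval - kvm_phys_timer_read() + cntvoff in the patch. */
    static int64_t tval_read(uint64_t cval, uint64_t phys_count, uint64_t cntvoff)
    {
        return (int64_t)(cval - guest_count(phys_count, cntvoff));
    }

    /* Writing TVAL: the new compare value is "now" plus the written value,
     * i.e. kvm_phys_timer_read() - cntvoff + val in the patch. */
    static uint64_t tval_write(int64_t tval, uint64_t phys_count, uint64_t cntvoff)
    {
        return guest_count(phys_count, cntvoff) + (uint64_t)tval;
    }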
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 99c37384ba7b..90cedebaeb94 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -56,7 +56,7 @@
__asm__(".arch_extension virt");
#endif
-DEFINE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state);
+DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
/* Per-CPU variable containing the currently running vcpu. */
@@ -224,9 +224,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
break;
- case KVM_CAP_NR_MEMSLOTS:
- r = KVM_USER_MEM_SLOTS;
- break;
case KVM_CAP_MSI_DEVID:
if (!kvm)
r = -EINVAL;
@@ -360,8 +357,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
int *last_ran;
+ kvm_host_data_t *cpu_data;
last_ran = this_cpu_ptr(vcpu->kvm->arch.last_vcpu_ran);
+ cpu_data = this_cpu_ptr(&kvm_host_data);
/*
* We might get preempted before the vCPU actually runs, but
@@ -373,18 +372,21 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
vcpu->cpu = cpu;
- vcpu->arch.host_cpu_context = this_cpu_ptr(&kvm_host_cpu_state);
+ vcpu->arch.host_cpu_context = &cpu_data->host_ctxt;
kvm_arm_set_running_vcpu(vcpu);
kvm_vgic_load(vcpu);
kvm_timer_vcpu_load(vcpu);
kvm_vcpu_load_sysregs(vcpu);
kvm_arch_vcpu_load_fp(vcpu);
+ kvm_vcpu_pmu_restore_guest(vcpu);
if (single_task_running())
vcpu_clear_wfe_traps(vcpu);
else
vcpu_set_wfe_traps(vcpu);
+
+ vcpu_ptrauth_setup_lazy(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -393,6 +395,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
kvm_vcpu_put_sysregs(vcpu);
kvm_timer_vcpu_put(vcpu);
kvm_vgic_put(vcpu);
+ kvm_vcpu_pmu_restore_host(vcpu);
vcpu->cpu = -1;
@@ -545,6 +548,9 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
if (likely(vcpu->arch.has_run_once))
return 0;
+ if (!kvm_arm_vcpu_is_finalized(vcpu))
+ return -EPERM;
+
vcpu->arch.has_run_once = true;
if (likely(irqchip_in_kernel(kvm))) {
@@ -934,7 +940,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
const struct kvm_vcpu_init *init)
{
- unsigned int i;
+ unsigned int i, ret;
int phys_target = kvm_target_cpu();
if (init->target != phys_target)
@@ -969,9 +975,14 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
vcpu->arch.target = phys_target;
/* Now we know what it is, we can reset it. */
- return kvm_reset_vcpu(vcpu);
-}
+ ret = kvm_reset_vcpu(vcpu);
+ if (ret) {
+ vcpu->arch.target = -1;
+ bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
+ }
+ return ret;
+}
static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
struct kvm_vcpu_init *init)
@@ -1116,6 +1127,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (unlikely(!kvm_vcpu_initialized(vcpu)))
break;
+ r = -EPERM;
+ if (!kvm_arm_vcpu_is_finalized(vcpu))
+ break;
+
r = -EFAULT;
if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
break;
@@ -1169,6 +1184,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
return kvm_arm_vcpu_set_events(vcpu, &events);
}
+ case KVM_ARM_VCPU_FINALIZE: {
+ int what;
+
+ if (!kvm_vcpu_initialized(vcpu))
+ return -ENOEXEC;
+
+ if (get_user(what, (const int __user *)argp))
+ return -EFAULT;
+
+ return kvm_arm_vcpu_finalize(vcpu, what);
+ }
default:
r = -EINVAL;
}
@@ -1549,11 +1575,11 @@ static int init_hyp_mode(void)
}
for_each_possible_cpu(cpu) {
- kvm_cpu_context_t *cpu_ctxt;
+ kvm_host_data_t *cpu_data;
- cpu_ctxt = per_cpu_ptr(&kvm_host_cpu_state, cpu);
- kvm_init_host_cpu_context(cpu_ctxt, cpu);
- err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP);
+ cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
+ kvm_init_host_cpu_context(&cpu_data->host_ctxt, cpu);
+ err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);
if (err) {
kvm_err("Cannot map host CPU state: %d\n", err);
@@ -1664,6 +1690,10 @@ int kvm_arch_init(void *opaque)
if (err)
return err;
+ err = kvm_arm_init_sve();
+ if (err)
+ return err;
+
if (!in_hyp_mode) {
err = init_hyp_mode();
if (err)
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
index 264d92da3240..370bd6c5e6cb 100644
--- a/virt/kvm/arm/hyp/vgic-v3-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v3-sr.c
@@ -222,7 +222,7 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
}
}
- if (used_lrs) {
+ if (used_lrs || cpu_if->its_vpe.its_vm) {
int i;
u32 elrsr;
@@ -247,7 +247,7 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
int i;
- if (used_lrs) {
+ if (used_lrs || cpu_if->its_vpe.its_vm) {
write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
for (i = 0; i < used_lrs; i++)
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index ffd7acdceac7..74b6582eaa3c 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -102,8 +102,7 @@ static bool kvm_is_device_pfn(unsigned long pfn)
* @addr: IPA
* @pmd: pmd pointer for IPA
*
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
*/
static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
{
@@ -121,8 +120,7 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
* @addr: IPA
* @pud: pud pointer for IPA
*
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
+ * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
*/
static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
{
@@ -191,7 +189,7 @@ static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr
VM_BUG_ON(pmd_thp_or_huge(*pmd));
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
- pte_free_kernel(NULL, pte_table);
+ free_page((unsigned long)pte_table);
put_page(virt_to_page(pmd));
}
@@ -899,9 +897,8 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
* kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
* @kvm: The KVM struct pointer for the VM.
*
- * Allocates only the stage-2 HW PGD level table(s) (can support either full
- * 40-bit input addresses or limited to 32-bit input addresses). Clears the
- * allocated pages.
+ * Allocates only the stage-2 HW PGD level table(s) of size defined by
+ * stage2_pgd_size(kvm).
*
* Note we don't need locking here as this is only called when the VM is
* created, which can only be done once.
@@ -1067,25 +1064,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
{
pmd_t *pmd, old_pmd;
+retry:
pmd = stage2_get_pmd(kvm, cache, addr);
VM_BUG_ON(!pmd);
old_pmd = *pmd;
+ /*
+ * Multiple vcpus faulting on the same PMD entry can
+ * lead to them sequentially updating the PMD with the
+ * same value. Following the break-before-make
+ * (pmd_clear() followed by tlb_flush()) process can
+ * hinder forward progress due to refaults generated
+ * on missing translations.
+ *
+ * Skip updating the page table if the entry is
+ * unchanged.
+ */
+ if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+ return 0;
+
if (pmd_present(old_pmd)) {
/*
- * Multiple vcpus faulting on the same PMD entry, can
- * lead to them sequentially updating the PMD with the
- * same value. Following the break-before-make
- * (pmd_clear() followed by tlb_flush()) process can
- * hinder forward progress due to refaults generated
- * on missing translations.
+ * If we already have PTE level mapping for this block,
+ * we must unmap it to avoid inconsistent TLB state and
+ * leaking the table page. We could end up in this situation
+ * if the memory slot was marked for dirty logging and was
+ * reverted, leaving PTE level mappings for the pages accessed
+ * during the period. So, unmap the PTE level mapping for this
+ * block and retry, as we could have released the upper level
+ * table in the process.
*
- * Skip updating the page table if the entry is
- * unchanged.
+ * Normal THP split/merge follows the mmu_notifier callbacks and
+ * is handled accordingly.
*/
- if (pmd_val(old_pmd) == pmd_val(*new_pmd))
- return 0;
-
+ if (!pmd_thp_or_huge(old_pmd)) {
+ unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
+ goto retry;
+ }
/*
* Mapping in huge pages should only happen through a
* fault. If a page is merged into a transparent huge
@@ -1097,8 +1112,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
* should become splitting first, unmapped, merged,
* and mapped back in on-demand.
*/
- VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-
+ WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
} else {
@@ -1114,6 +1128,7 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
{
pud_t *pudp, old_pud;
+retry:
pudp = stage2_get_pud(kvm, cache, addr);
VM_BUG_ON(!pudp);
@@ -1121,14 +1136,23 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
/*
 * A large number of vcpus faulting on the same stage 2 entry
- * can lead to a refault due to the
- * stage2_pud_clear()/tlb_flush(). Skip updating the page
- * tables if there is no change.
+ * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
+ * Skip updating the page tables if there is no change.
*/
if (pud_val(old_pud) == pud_val(*new_pudp))
return 0;
if (stage2_pud_present(kvm, old_pud)) {
+ /*
+ * If we already have table level mapping for this block, unmap
+ * the range for this block and retry.
+ */
+ if (!stage2_pud_huge(kvm, old_pud)) {
+ unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
+ goto retry;
+ }
+
+ WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
stage2_pud_clear(kvm, pudp);
kvm_tlb_flush_vmid_ipa(kvm, addr);
} else {
@@ -1451,13 +1475,11 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
}
/**
- * stage2_wp_puds - write protect PGD range
- * @pgd: pointer to pgd entry
- * @addr: range start address
- * @end: range end address
- *
- * Process PUD entries, for a huge PUD we cause a panic.
- */
+ * stage2_wp_puds - write protect PGD range
+ * @pgd: pointer to pgd entry
+ * @addr: range start address
+ * @end: range end address
+ */
static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
phys_addr_t addr, phys_addr_t end)
{
@@ -1594,8 +1616,9 @@ static void kvm_send_hwpoison_signal(unsigned long address,
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}
-static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
- unsigned long hva)
+static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
+ unsigned long hva,
+ unsigned long map_size)
{
gpa_t gpa_start;
hva_t uaddr_start, uaddr_end;
@@ -1610,34 +1633,34 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
/*
* Pages belonging to memslots that don't have the same alignment
- * within a PMD for userspace and IPA cannot be mapped with stage-2
- * PMD entries, because we'll end up mapping the wrong pages.
+ * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
+ * PMD/PUD entries, because we'll end up mapping the wrong pages.
*
* Consider a layout like the following:
*
* memslot->userspace_addr:
* +-----+--------------------+--------------------+---+
- * |abcde|fgh Stage-1 PMD | Stage-1 PMD tv|xyz|
+ * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
* +-----+--------------------+--------------------+---+
*
* memslot->base_gfn << PAGE_SIZE:
* +---+--------------------+--------------------+-----+
- * |abc|def Stage-2 PMD | Stage-2 PMD |tvxyz|
+ * |abc|def Stage-2 block | Stage-2 block |tvxyz|
* +---+--------------------+--------------------+-----+
*
- * If we create those stage-2 PMDs, we'll end up with this incorrect
+ * If we create those stage-2 blocks, we'll end up with this incorrect
* mapping:
* d -> f
* e -> g
* f -> h
*/
- if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK))
+ if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
return false;
/*
* Next, let's make sure we're not trying to map anything not covered
- * by the memslot. This means we have to prohibit PMD size mappings
- * for the beginning and end of a non-PMD aligned and non-PMD sized
+ * by the memslot. This means we have to prohibit block size mappings
+ * for the beginning and end of a non-block aligned and non-block sized
* memory slot (illustrated by the head and tail parts of the
* userspace view above containing pages 'abcde' and 'xyz',
* respectively).
@@ -1646,8 +1669,8 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
* userspace_addr or the base_gfn, as both are equally aligned (per
* the check above) and equally sized.
*/
- return (hva & S2_PMD_MASK) >= uaddr_start &&
- (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end;
+ return (hva & ~(map_size - 1)) >= uaddr_start &&
+ (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
@@ -1676,12 +1699,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
- if (!fault_supports_stage2_pmd_mappings(memslot, hva))
- force_pte = true;
-
- if (logging_active)
- force_pte = true;
-
/* Let's check if we will get back a huge page backed by hugetlbfs */
down_read(&current->mm->mmap_sem);
vma = find_vma_intersection(current->mm, hva, hva + 1);
@@ -1692,6 +1709,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
vma_pagesize = vma_kernel_pagesize(vma);
+ if (logging_active ||
+ !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+ force_pte = true;
+ vma_pagesize = PAGE_SIZE;
+ }
+
/*
* The stage2 has a minimum of 2 level table (For arm64 see
* kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
@@ -1699,11 +1722,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* As for PUD huge maps, we must make sure that we have at least
* 3 levels, i.e, PMD is not folded.
*/
- if ((vma_pagesize == PMD_SIZE ||
- (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) &&
- !force_pte) {
+ if (vma_pagesize == PMD_SIZE ||
+ (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
- }
up_read(&current->mm->mmap_sem);
/* We need minimum second+third level pages */
@@ -1760,8 +1781,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* Only PMD_SIZE transparent hugepages(THP) are
* currently supported. This code will need to be
* updated to support other THP sizes.
+ *
+ * Make sure the host VA and the guest IPA are sufficiently
+ * aligned and that the block is contained within the memslot.
*/
- if (transparent_hugepage_adjust(&pfn, &fault_ipa))
+ if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
+ transparent_hugepage_adjust(&pfn, &fault_ipa))
vma_pagesize = PMD_SIZE;
}
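The mmu.c hunks above generalize fault_supports_stage2_pmd_mappings() into fault_supports_stage2_huge_mapping() by replacing the hard-coded S2_PMD_MASK/S2_PMD_SIZE with a map_size parameter: x & (map_size - 1) extracts the offset within a block and x & ~(map_size - 1) rounds down to a block boundary, which works because every candidate map_size (PAGE_SIZE, PMD_SIZE, PUD_SIZE) is a power of two. A self-contained sketch of the generalized check, with argument names spelled out:

    #include <stdbool.h>
    #include <stdint.h>

    /* Sketch of the generalized test; map_size must be a power of two. */
    static bool supports_huge_mapping(uint64_t gpa_start,   /* memslot base IPA */
                                      uint64_t uaddr_start, /* memslot base VA  */
                                      uint64_t uaddr_end,   /* memslot end VA   */
                                      uint64_t hva,         /* faulting VA      */
                                      uint64_t map_size)
    {
        /* IPA and userspace VA must share the same offset within a
         * block, or a block mapping would map the wrong pages. */
        if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
            return false;

        /* The whole block containing hva must lie inside the memslot. */
        return (hva & ~(map_size - 1)) >= uaddr_start &&
               (hva & ~(map_size - 1)) + map_size <= uaddr_end;
    }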
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index ab3f47745d9c..44ceaccb18cf 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -754,8 +754,9 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
int esz = GITS_BASER_ENTRY_SIZE(baser);
- int index;
+ int index, idx;
gfn_t gfn;
+ bool ret;
switch (type) {
case GITS_BASER_TYPE_DEVICE:
@@ -782,7 +783,8 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
if (eaddr)
*eaddr = addr;
- return kvm_is_visible_gfn(its->dev->kvm, gfn);
+
+ goto out;
}
/* calculate and check the index into the 1st level */
@@ -812,7 +814,12 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
if (eaddr)
*eaddr = indirect_ptr;
- return kvm_is_visible_gfn(its->dev->kvm, gfn);
+
+out:
+ idx = srcu_read_lock(&its->dev->kvm->srcu);
+ ret = kvm_is_visible_gfn(its->dev->kvm, gfn);
+ srcu_read_unlock(&its->dev->kvm->srcu, idx);
+ return ret;
}
static int vgic_its_alloc_collection(struct vgic_its *its,
@@ -1729,8 +1736,8 @@ static void vgic_its_destroy(struct kvm_device *kvm_dev)
kfree(its);
}
-int vgic_its_has_attr_regs(struct kvm_device *dev,
- struct kvm_device_attr *attr)
+static int vgic_its_has_attr_regs(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
{
const struct vgic_register_region *region;
gpa_t offset = attr->attr;
@@ -1750,9 +1757,9 @@ int vgic_its_has_attr_regs(struct kvm_device *dev,
return 0;
}
-int vgic_its_attr_regs_access(struct kvm_device *dev,
- struct kvm_device_attr *attr,
- u64 *reg, bool is_write)
+static int vgic_its_attr_regs_access(struct kvm_device *dev,
+ struct kvm_device_attr *attr,
+ u64 *reg, bool is_write)
{
const struct vgic_register_region *region;
struct vgic_its *its;
@@ -1919,7 +1926,7 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
ite->collection->collection_id;
val = cpu_to_le64(val);
- return kvm_write_guest(kvm, gpa, &val, ite_esz);
+ return kvm_write_guest_lock(kvm, gpa, &val, ite_esz);
}
/**
@@ -2066,7 +2073,7 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev,
(itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) |
(dev->num_eventid_bits - 1));
val = cpu_to_le64(val);
- return kvm_write_guest(kvm, ptr, &val, dte_esz);
+ return kvm_write_guest_lock(kvm, ptr, &val, dte_esz);
}
/**
@@ -2246,7 +2253,7 @@ static int vgic_its_save_cte(struct vgic_its *its,
((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) |
collection->collection_id);
val = cpu_to_le64(val);
- return kvm_write_guest(its->dev->kvm, gpa, &val, esz);
+ return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz);
}
static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
@@ -2317,7 +2324,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its)
*/
val = 0;
BUG_ON(cte_esz > sizeof(val));
- ret = kvm_write_guest(its->dev->kvm, gpa, &val, cte_esz);
+ ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz);
return ret;
}
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index 4a12322bf7df..9f4843fe9cda 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -200,6 +200,9 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
+ if (was_enabled && !vgic_cpu->lpis_enabled)
+ vgic_flush_pending_lpis(vcpu);
+
if (!was_enabled && vgic_cpu->lpis_enabled)
vgic_enable_lpis(vcpu);
}
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 408a78eb6a97..9f87e58dbd4a 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -358,7 +358,7 @@ retry:
if (status) {
/* clear consumed data */
val &= ~(1 << bit_nr);
- ret = kvm_write_guest(kvm, ptr, &val, 1);
+ ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
if (ret)
return ret;
}
@@ -409,7 +409,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
else
val &= ~(1 << bit_nr);
- ret = kvm_write_guest(kvm, ptr, &val, 1);
+ ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
if (ret)
return ret;
}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index abd9c7352677..191deccf60bf 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -151,6 +151,27 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
kfree(irq);
}
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_irq *irq, *tmp;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
+
+ list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
+ if (irq->intid >= VGIC_MIN_LPI) {
+ raw_spin_lock(&irq->irq_lock);
+ list_del(&irq->ap_list);
+ irq->vcpu = NULL;
+ raw_spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+}
+
void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
{
WARN_ON(irq_set_irqchip_state(irq->host_irq,
@@ -867,15 +888,21 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
* either observe the new interrupt before or after doing this check,
* and introducing additional synchronization mechanism doesn't change
* this.
+ *
+ * Note that we still need to go through the whole thing if anything
+ * can be directly injected (GICv4).
*/
- if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
+ if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
+ !vgic_supports_direct_msis(vcpu->kvm))
return;
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
- raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
- vgic_flush_lr_state(vcpu);
- raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+ if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
+ raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+ vgic_flush_lr_state(vcpu);
+ raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+ }
if (can_access_vgic_from_kernel())
vgic_restore_state(vcpu);
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index a90024718ca4..abeeffabc456 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -238,6 +238,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu);
bool vgic_has_its(struct kvm *kvm);
int kvm_vgic_register_its_device(void);
void vgic_enable_lpis(struct kvm_vcpu *vcpu);
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu);
int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 4325250afd72..001aeda4c154 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -214,9 +214,9 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
if (flags & EPOLLHUP) {
/* The eventfd is closing, detach from KVM */
- unsigned long flags;
+ unsigned long iflags;
- spin_lock_irqsave(&kvm->irqfds.lock, flags);
+ spin_lock_irqsave(&kvm->irqfds.lock, iflags);
/*
* We must check if someone deactivated the irqfd before
@@ -230,7 +230,7 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
if (irqfd_is_active(irqfd))
irqfd_deactivate(irqfd);
- spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
+ spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
}
return 0;
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 3547b0d8c91e..79e59e4fa3dc 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -144,18 +144,19 @@ static int setup_routing_entry(struct kvm *kvm,
{
struct kvm_kernel_irq_routing_entry *ei;
int r;
+ u32 gsi = array_index_nospec(ue->gsi, KVM_MAX_IRQ_ROUTES);
/*
* Do not allow GSI to be mapped to the same irqchip more than once.
* Allow only one to one mapping between GSI and non-irqchip routing.
*/
- hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
+ hlist_for_each_entry(ei, &rt->map[gsi], link)
if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
ue->u.irqchip.irqchip == ei->irqchip.irqchip)
return -EINVAL;
- e->gsi = ue->gsi;
+ e->gsi = gsi;
e->type = ue->type;
r = kvm_set_routing_entry(kvm, e, ue);
if (r)
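The irqchip.c hunk is Spectre-v1 hardening: ue->gsi is user-controlled and, although the caller bounds-checks it against KVM_MAX_IRQ_ROUTES, that check could be bypassed speculatively, so the index is clamped with array_index_nospec() before it is used to index rt->map[]. A simplified userspace illustration of the underlying branchless-clamp idea (the kernel's array_index_mask_nospec() differs in detail, and architectures provide their own versions):

    #include <stddef.h>
    #include <stdint.h>

    /* Return index when index < size, else 0, without a conditional
     * branch the CPU could mispredict past. Relies on arithmetic right
     * shift of negative values (true on mainstream compilers). */
    static size_t index_nospec(size_t index, size_t size)
    {
        size_t mask = (size_t)((intptr_t)(index - size) >> (sizeof(size_t) * 8 - 1));
        return index & mask;    /* mask is ~0 when in range, 0 otherwise */
    }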
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f25aa98a94df..f0d13d9d125d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -51,9 +51,9 @@
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
+#include <linux/io.h>
#include <asm/processor.h>
-#include <asm/io.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
@@ -391,7 +391,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
spin_unlock(&kvm->mmu_lock);
ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start,
- range->end, range->blockable);
+ range->end,
+ mmu_notifier_range_blockable(range));
srcu_read_unlock(&kvm->srcu, idx);
@@ -1134,11 +1135,11 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
/**
- * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
+ * kvm_get_dirty_log_protect - get a snapshot of dirty pages
* and reenable dirty page tracking for the corresponding pages.
* @kvm: pointer to kvm instance
* @log: slot id and address to which we copy the log
- * @is_dirty: flag set if any page is dirty
+ * @flush: true if TLB flush is needed by caller
*
* We need to keep it in mind that VCPU threads can write to the bitmap
* concurrently. So, to avoid losing track of dirty pages we keep the
@@ -1223,6 +1224,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
* and reenable dirty page tracking for the corresponding pages.
* @kvm: pointer to kvm instance
* @log: slot id and address from which to fetch the bitmap of dirty pages
+ * @flush: true if TLB flush is needed by caller
*/
int kvm_clear_dirty_log_protect(struct kvm *kvm,
struct kvm_clear_dirty_log *log, bool *flush)
@@ -1240,7 +1242,7 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
return -EINVAL;
- if ((log->first_page & 63) || (log->num_pages & 63))
+ if (log->first_page & 63)
return -EINVAL;
slots = __kvm_memslots(kvm, as_id);
@@ -1250,11 +1252,12 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
if (!dirty_bitmap)
return -ENOENT;
- n = kvm_dirty_bitmap_bytes(memslot);
+ n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
if (log->first_page > memslot->npages ||
- log->num_pages > memslot->npages - log->first_page)
- return -EINVAL;
+ log->num_pages > memslot->npages - log->first_page ||
+ (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
+ return -EINVAL;
*flush = false;
dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
@@ -1262,8 +1265,8 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
return -EFAULT;
spin_lock(&kvm->mmu_lock);
- for (offset = log->first_page,
- i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--;
+ for (offset = log->first_page, i = offset / BITS_PER_LONG,
+ n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
i++, offset += BITS_PER_LONG) {
unsigned long mask = *dirty_bitmap_buffer++;
atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
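With the relaxed check above, num_pages no longer has to be a multiple of 64: an unaligned count is accepted when it reaches exactly to the end of the memslot, and both the user-copy size and the loop count are rounded up to whole 64-bit bitmap words. A small worked example with hypothetical numbers:

    #include <stdio.h>

    #define BITS_PER_LONG      64
    #define ALIGN(x, a)        (((x) + (a) - 1) & ~((unsigned long)(a) - 1))
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        /* Hypothetical request: clear 72 pages starting at page 128 of
         * a 200-page memslot. 72 is not a multiple of 64, but
         * 128 + 72 == 200 reaches the slot end, so it is accepted. */
        unsigned long num_pages = 72;

        printf("bitmap bytes copied: %lu\n",
               ALIGN(num_pages, BITS_PER_LONG) / 8);          /* 16 */
        printf("bitmap words walked: %lu\n",
               DIV_ROUND_UP(num_pages, BITS_PER_LONG));       /* 2 */
        return 0;
    }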
@@ -1740,6 +1743,70 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(gfn_to_page);
+static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn,
+ struct kvm_host_map *map)
+{
+ kvm_pfn_t pfn;
+ void *hva = NULL;
+ struct page *page = KVM_UNMAPPED_PAGE;
+
+ if (!map)
+ return -EINVAL;
+
+ pfn = gfn_to_pfn_memslot(slot, gfn);
+ if (is_error_noslot_pfn(pfn))
+ return -EINVAL;
+
+ if (pfn_valid(pfn)) {
+ page = pfn_to_page(pfn);
+ hva = kmap(page);
+ } else {
+ hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
+ }
+
+ if (!hva)
+ return -EFAULT;
+
+ map->page = page;
+ map->hva = hva;
+ map->pfn = pfn;
+ map->gfn = gfn;
+
+ return 0;
+}
+
+int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
+{
+ return __kvm_map_gfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, map);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_map);
+
+void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
+ bool dirty)
+{
+ if (!map)
+ return;
+
+ if (!map->hva)
+ return;
+
+ if (map->page)
+ kunmap(map->page);
+ else
+ memunmap(map->hva);
+
+ if (dirty) {
+ kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
+ kvm_release_pfn_dirty(map->pfn);
+ } else {
+ kvm_release_pfn_clean(map->pfn);
+ }
+
+ map->hva = NULL;
+ map->page = NULL;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
+
struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
{
kvm_pfn_t pfn;
@@ -2253,7 +2320,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
u64 block_ns;
start = cur = ktime_get();
- if (vcpu->halt_poll_ns) {
+ if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
++vcpu->stat.halt_attempted_poll;
@@ -2884,6 +2951,16 @@ out:
}
#endif
+static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct kvm_device *dev = filp->private_data;
+
+ if (dev->ops->mmap)
+ return dev->ops->mmap(dev, vma);
+
+ return -ENODEV;
+}
+
static int kvm_device_ioctl_attr(struct kvm_device *dev,
int (*accessor)(struct kvm_device *dev,
struct kvm_device_attr *attr),
@@ -2905,6 +2982,9 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
{
struct kvm_device *dev = filp->private_data;
+ if (dev->kvm->mm != current->mm)
+ return -EIO;
+
switch (ioctl) {
case KVM_SET_DEVICE_ATTR:
return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
@@ -2925,6 +3005,13 @@ static int kvm_device_release(struct inode *inode, struct file *filp)
struct kvm_device *dev = filp->private_data;
struct kvm *kvm = dev->kvm;
+ if (dev->ops->release) {
+ mutex_lock(&kvm->lock);
+ list_del(&dev->vm_node);
+ dev->ops->release(dev);
+ mutex_unlock(&kvm->lock);
+ }
+
kvm_put_kvm(kvm);
return 0;
}
@@ -2933,6 +3020,7 @@ static const struct file_operations kvm_device_fops = {
.unlocked_ioctl = kvm_device_ioctl,
.release = kvm_device_release,
KVM_COMPAT(kvm_device_ioctl),
+ .mmap = kvm_device_mmap,
};
struct kvm_device *kvm_device_from_filp(struct file *filp)
@@ -2974,12 +3062,14 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
struct kvm_device_ops *ops = NULL;
struct kvm_device *dev;
bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
+ int type;
int ret;
if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
return -ENODEV;
- ops = kvm_device_ops_table[cd->type];
+ type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
+ ops = kvm_device_ops_table[type];
if (ops == NULL)
return -ENODEV;
@@ -2994,7 +3084,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
dev->kvm = kvm;
mutex_lock(&kvm->lock);
- ret = ops->create(dev, cd->type);
+ ret = ops->create(dev, type);
if (ret < 0) {
mutex_unlock(&kvm->lock);
kfree(dev);
@@ -3039,7 +3129,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_CHECK_EXTENSION_VM:
case KVM_CAP_ENABLE_CAP_VM:
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
- case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
+ case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
#endif
return 1;
#ifdef CONFIG_KVM_MMIO
@@ -3058,6 +3148,8 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
#endif
case KVM_CAP_MAX_VCPU_ID:
return KVM_MAX_VCPU_ID;
+ case KVM_CAP_NR_MEMSLOTS:
+ return KVM_USER_MEM_SLOTS;
default:
break;
}
@@ -3075,7 +3167,7 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
{
switch (cap->cap) {
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
- case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
+ case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
if (cap->flags || (cap->args[0] & ~1))
return -EINVAL;
kvm->manual_dirty_log_protect = cap->args[0];
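The kvm_vcpu_map()/kvm_vcpu_unmap() pair added in the kvm_main.c hunks gives callers a kernel mapping of a guest frame whether or not the pfn is backed by a struct page (kmap() for ordinary pages, memremap() otherwise), with the dirty flag deciding how the pfn is released. A hedged sketch of a typical caller; the helper and its gfn argument are hypothetical, only the map/unmap API comes from the patch:

    #include <linux/kvm_host.h>

    /* Hypothetical helper: zero one guest page via the new API. */
    static int zero_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn)
    {
        struct kvm_host_map map;

        if (kvm_vcpu_map(vcpu, gfn, &map))
            return -EFAULT;

        /* map.hva is valid here for page-backed and non-page pfns alike */
        memset(map.hva, 0, PAGE_SIZE);

        /* dirty == true: mark the gfn dirty and release the pfn as dirty */
        kvm_vcpu_unmap(vcpu, &map, true);
        return 0;
    }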
diff --git a/virt/lib/Kconfig b/virt/lib/Kconfig
index 89a414f815d2..2d9523b7155e 100644
--- a/virt/lib/Kconfig
+++ b/virt/lib/Kconfig
@@ -1,2 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
config IRQ_BYPASS_MANAGER
tristate
diff --git a/virt/lib/Makefile b/virt/lib/Makefile
index 901228d1ffbc..bd7f9a78bb6b 100644
--- a/virt/lib/Makefile
+++ b/virt/lib/Makefile
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_IRQ_BYPASS_MANAGER) += irqbypass.o