diff options
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r-- | arch/x86/kvm/x86.c | 1915 |
1 files changed, 1228 insertions, 687 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7fa89c39c64f..2288ad829b32 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6,6 +6,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> @@ -41,17 +42,21 @@ #include <linux/srcu.h> #include <linux/slab.h> #include <linux/perf_event.h> +#include <linux/uaccess.h> #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS #include "trace.h" #include <asm/debugreg.h> -#include <asm/uaccess.h> #include <asm/msr.h> #include <asm/desc.h> #include <asm/mtrr.h> #include <asm/mce.h> +#include <asm/i387.h> +#include <asm/xcr.h> +#include <asm/pvclock.h> +#include <asm/div64.h> #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -62,12 +67,13 @@ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXSAVE \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) #define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P +#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) /* EFER defaults: * - enable syscall per default because its emulated by KVM @@ -147,6 +153,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +u64 __read_mostly host_xcr0; + +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; @@ -271,6 +284,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, u32 prev_nr; int class1, class2; + kvm_make_request(KVM_REQ_EVENT, vcpu); + if (!vcpu->arch.exception.pending) { queue: vcpu->arch.exception.pending = true; @@ -285,7 +300,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, prev_nr = vcpu->arch.exception.nr; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } class1 = exception_class(prev_nr); @@ -316,16 +331,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) } EXPORT_SYMBOL_GPL(kvm_requeue_exception); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, - u32 error_code) +void kvm_inject_page_fault(struct kvm_vcpu *vcpu) { + unsigned error_code = vcpu->arch.fault.error_code; + ++vcpu->stat.pf_guest; - vcpu->arch.cr2 = addr; + vcpu->arch.cr2 = vcpu->arch.fault.address; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } +void kvm_propagate_fault(struct kvm_vcpu *vcpu) +{ + if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested) + vcpu->arch.nested_mmu.inject_page_fault(vcpu); + else + vcpu->arch.mmu.inject_page_fault(vcpu); + + vcpu->arch.fault.nested = false; +} + void kvm_inject_nmi(struct kvm_vcpu *vcpu) { + kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.nmi_pending = 1; } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -356,18 +383,49 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) EXPORT_SYMBOL_GPL(kvm_require_cpl); /* + * This function will be used to read from the physical memory of the currently + * running guest. The difference to kvm_read_guest_page is that this function + * can read from guest physical or from the guest's guest physical memory. + */ +int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gfn_t ngfn, void *data, int offset, int len, + u32 access) +{ + gfn_t real_gfn; + gpa_t ngpa; + + ngpa = gfn_to_gpa(ngfn); + real_gfn = mmu->translate_gpa(vcpu, ngpa, access); + if (real_gfn == UNMAPPED_GVA) + return -EFAULT; + + real_gfn = gpa_to_gfn(real_gfn); + + return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); +} +EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); + +int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, + void *data, int offset, int len, u32 access) +{ + return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, + data, offset, len, access); +} + +/* * Load the pae pdptrs. Return true is they are all valid. */ -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; int i; int ret; - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; - ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte)); + ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, + offset * sizeof(u64), sizeof(pdpte), + PFERR_USER_MASK|PFERR_WRITE_MASK); if (ret < 0) { ret = 0; goto out; @@ -381,7 +439,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) } ret = 1; - memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); + memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); __set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail); __set_bit(VCPU_EXREG_PDPTR, @@ -394,8 +452,10 @@ EXPORT_SYMBOL_GPL(load_pdptrs); static bool pdptrs_changed(struct kvm_vcpu *vcpu) { - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; bool changed = true; + int offset; + gfn_t gfn; int r; if (is_long_mode(vcpu) || !is_pae(vcpu)) @@ -405,130 +465,177 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_avail)) return true; - r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); + gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT; + offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1); + r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), + PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) goto out; - changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; + changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; out: return changed; } -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | + X86_CR0_CD | X86_CR0_NW; + cr0 |= X86_CR0_ET; #ifdef CONFIG_X86_64 - if (cr0 & 0xffffffff00000000UL) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr0 & 0xffffffff00000000UL) + return 1; #endif cr0 &= ~CR0_RESERVED_BITS; - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - kvm_inject_gp(vcpu, 0); - return; - } + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) + return 1; - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - kvm_inject_gp(vcpu, 0); - return; - } + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) + return 1; if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 if ((vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; - if (!is_pae(vcpu)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (!is_pae(vcpu)) + return 1; kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - if (cs_l) { - kvm_inject_gp(vcpu, 0); - return; - - } + if (cs_l) + return 1; } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } - + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.cr3)) + return 1; } kvm_x86_ops->set_cr0(vcpu, cr0); - kvm_mmu_reset_context(vcpu); - return; + if ((cr0 ^ old_cr0) & update_bits) + kvm_mmu_reset_context(vcpu); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); + (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; + u64 xcr0; + + /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ + if (index != XCR_XFEATURE_ENABLED_MASK) + return 1; + xcr0 = xcr; + if (kvm_x86_ops->get_cpl(vcpu) != 0) + return 1; + if (!(xcr0 & XSTATE_FP)) + return 1; + if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) + return 1; + if (xcr0 & ~host_xcr0) + return 1; + vcpu->arch.xcr0 = xcr0; + vcpu->guest_xcr0_loaded = 0; + return 0; +} - if (cr4 & CR4_RESERVED_BITS) { +int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +{ + if (__kvm_set_xcr(vcpu, index, xcr)) { kvm_inject_gp(vcpu, 0); + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_xcr); + +static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_XSAVE)); +} + +static void update_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (!best) return; + + /* Update OSXSAVE bit */ + if (cpu_has_xsave && best->function == 0x1) { + best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) + best->ecx |= bit(X86_FEATURE_OSXSAVE); } +} + +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; + + if (cr4 & CR4_RESERVED_BITS) + return 1; + + if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) + return 1; if (is_long_mode(vcpu)) { - if (!(cr4 & X86_CR4_PAE)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (!(cr4 & X86_CR4_PAE)) + return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } + && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)) + return 1; + + if (cr4 & X86_CR4_VMXE) + return 1; - if (cr4 & X86_CR4_VMXE) { - kvm_inject_gp(vcpu, 0); - return; - } kvm_x86_ops->set_cr4(vcpu, cr4); - vcpu->arch.cr4 = cr4; - kvm_mmu_reset_context(vcpu); + + if ((cr4 ^ old_cr4) & pdptr_bits) + kvm_mmu_reset_context(vcpu); + + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) + update_cpuid(vcpu); + + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr4); -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); - return; + return 0; } if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr3 & CR3_L_MODE_RESERVED_BITS) + return 1; } else { if (is_pae(vcpu)) { - if (cr3 & CR3_PAE_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } - if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr3 & CR3_PAE_RESERVED_BITS) + return 1; + if (is_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + return 1; } /* * We don't check reserved bits in nonpae mode, because @@ -546,24 +653,28 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) * to debug) behavior on the guest side. */ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) - kvm_inject_gp(vcpu, 0); - else { - vcpu->arch.cr3 = cr3; - vcpu->arch.mmu.new_cr3(vcpu); - } + return 1; + vcpu->arch.cr3 = cr3; + vcpu->arch.mmu.new_cr3(vcpu); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); -void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { - if (cr8 & CR8_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr8 & CR8_RESERVED_BITS) + return 1; if (irqchip_in_kernel(vcpu->kvm)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; + return 0; +} + +void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + if (__kvm_set_cr8(vcpu, cr8)) + kvm_inject_gp(vcpu, 0); } EXPORT_SYMBOL_GPL(kvm_set_cr8); @@ -576,7 +687,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { switch (dr) { case 0 ... 3: @@ -585,29 +696,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) vcpu->arch.eff_db[dr] = val; break; case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return 1; /* #UD */ /* fall through */ case 6: - if (val & 0xffffffff00000000ULL) { - kvm_inject_gp(vcpu, 0); - return 1; - } + if (val & 0xffffffff00000000ULL) + return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; break; case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return 1; /* #UD */ /* fall through */ default: /* 7 */ - if (val & 0xffffffff00000000ULL) { - kvm_inject_gp(vcpu, 0); - return 1; - } + if (val & 0xffffffff00000000ULL) + return -1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); @@ -618,28 +721,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) return 0; } + +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +{ + int res; + + res = __kvm_set_dr(vcpu, dr, val); + if (res > 0) + kvm_queue_exception(vcpu, UD_VECTOR); + else if (res < 0) + kvm_inject_gp(vcpu, 0); + + return res; +} EXPORT_SYMBOL_GPL(kvm_set_dr); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { switch (dr) { case 0 ... 3: *val = vcpu->arch.db[dr]; break; case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) return 1; - } /* fall through */ case 6: *val = vcpu->arch.dr6; break; case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) return 1; - } /* fall through */ default: /* 7 */ *val = vcpu->arch.dr7; @@ -648,12 +760,16 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) return 0; } -EXPORT_SYMBOL_GPL(kvm_get_dr); -static inline u32 bit(int bitno) +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { - return 1 << (bitno & 31); + if (_kvm_get_dr(vcpu, dr, val)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + return 0; } +EXPORT_SYMBOL_GPL(kvm_get_dr); /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS @@ -671,21 +787,25 @@ static u32 msrs_to_save[] = { HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_K6_STAR, + MSR_STAR, #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; static unsigned num_msrs_to_save; static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, + MSR_IA32_MCG_STATUS, + MSR_IA32_MCG_CTL, }; static int set_efer(struct kvm_vcpu *vcpu, u64 efer) { + u64 old_efer = vcpu->arch.efer; + if (efer & efer_reserved_bits) return 1; @@ -714,11 +834,13 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); - vcpu->arch.efer = efer; - vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); + /* Update reserved bits */ + if ((efer ^ old_efer) & EFER_NX) + kvm_mmu_reset_context(vcpu); + return 0; } @@ -770,7 +892,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) /* * The guest calculates current wall clock time by adding - * system time (updated by kvm_write_guest_time below) to the + * system time (updated by kvm_guest_time_update below) to the * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. */ @@ -798,65 +920,229 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor) return quotient; } -static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) +static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, + s8 *pshift, u32 *pmultiplier) { - uint64_t nsecs = 1000000000LL; + uint64_t scaled64; int32_t shift = 0; uint64_t tps64; uint32_t tps32; - tps64 = tsc_khz * 1000LL; - while (tps64 > nsecs*2) { + tps64 = base_khz * 1000LL; + scaled64 = scaled_khz * 1000LL; + while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { tps64 >>= 1; shift--; } tps32 = (uint32_t)tps64; - while (tps32 <= (uint32_t)nsecs) { - tps32 <<= 1; + while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { + if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) + scaled64 >>= 1; + else + tps32 <<= 1; shift++; } - hv_clock->tsc_shift = shift; - hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); + *pshift = shift; + *pmultiplier = div_frac(scaled64, tps32); - pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", - __func__, tsc_khz, hv_clock->tsc_shift, - hv_clock->tsc_to_system_mul); + pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", + __func__, base_khz, scaled_khz, shift, *pmultiplier); +} + +static inline u64 get_kernel_ns(void) +{ + struct timespec ts; + + WARN_ON(preemptible()); + ktime_get_ts(&ts); + monotonic_to_bootbased(&ts); + return timespec_to_ns(&ts); } static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); +unsigned long max_tsc_khz; -static void kvm_write_guest_time(struct kvm_vcpu *v) +static inline int kvm_tsc_changes_freq(void) +{ + int cpu = get_cpu(); + int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + cpufreq_quick_get(cpu) != 0; + put_cpu(); + return ret; +} + +static inline u64 nsec_to_cycles(u64 nsec) +{ + u64 ret; + + WARN_ON(preemptible()); + if (kvm_tsc_changes_freq()) + printk_once(KERN_WARNING + "kvm: unreliable cycle conversion on adjustable rate TSC\n"); + ret = nsec * __get_cpu_var(cpu_tsc_khz); + do_div(ret, USEC_PER_SEC); + return ret; +} + +static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) +{ + /* Compute a scale to convert nanoseconds in TSC cycles */ + kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, + &kvm->arch.virtual_tsc_shift, + &kvm->arch.virtual_tsc_mult); + kvm->arch.virtual_tsc_khz = this_tsc_khz; +} + +static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) +{ + u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, + vcpu->kvm->arch.virtual_tsc_mult, + vcpu->kvm->arch.virtual_tsc_shift); + tsc += vcpu->arch.last_tsc_write; + return tsc; +} + +void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + u64 offset, ns, elapsed; + unsigned long flags; + s64 sdiff; + + spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + offset = data - native_read_tsc(); + ns = get_kernel_ns(); + elapsed = ns - kvm->arch.last_tsc_nsec; + sdiff = data - kvm->arch.last_tsc_write; + if (sdiff < 0) + sdiff = -sdiff; + + /* + * Special case: close write to TSC within 5 seconds of + * another CPU is interpreted as an attempt to synchronize + * The 5 seconds is to accomodate host load / swapping as + * well as any reset of TSC during the boot process. + * + * In that case, for a reliable TSC, we can match TSC offsets, + * or make a best guest using elapsed value. + */ + if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) && + elapsed < 5ULL * NSEC_PER_SEC) { + if (!check_tsc_unstable()) { + offset = kvm->arch.last_tsc_offset; + pr_debug("kvm: matched tsc offset for %llu\n", data); + } else { + u64 delta = nsec_to_cycles(elapsed); + offset += delta; + pr_debug("kvm: adjusted tsc offset by %llu\n", delta); + } + ns = kvm->arch.last_tsc_nsec; + } + kvm->arch.last_tsc_nsec = ns; + kvm->arch.last_tsc_write = data; + kvm->arch.last_tsc_offset = offset; + kvm_x86_ops->write_tsc_offset(vcpu, offset); + spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + /* Reset of TSC must disable overshoot protection below */ + vcpu->arch.hv_clock.tsc_timestamp = 0; + vcpu->arch.last_tsc_write = data; + vcpu->arch.last_tsc_nsec = ns; +} +EXPORT_SYMBOL_GPL(kvm_write_tsc); + +static int kvm_guest_time_update(struct kvm_vcpu *v) { - struct timespec ts; unsigned long flags; struct kvm_vcpu_arch *vcpu = &v->arch; void *shared_kaddr; unsigned long this_tsc_khz; + s64 kernel_ns, max_kernel_ns; + u64 tsc_timestamp; - if ((!vcpu->time_page)) - return; + /* Keep irq disabled to prevent changes to the clock */ + local_irq_save(flags); + kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); + kernel_ns = get_kernel_ns(); + this_tsc_khz = __get_cpu_var(cpu_tsc_khz); - this_tsc_khz = get_cpu_var(cpu_tsc_khz); - if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { - kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = this_tsc_khz; + if (unlikely(this_tsc_khz == 0)) { + local_irq_restore(flags); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); + return 1; + } + + /* + * We may have to catch up the TSC to match elapsed wall clock + * time for two reasons, even if kvmclock is used. + * 1) CPU could have been running below the maximum TSC rate + * 2) Broken TSC compensation resets the base at each VCPU + * entry to avoid unknown leaps of TSC even when running + * again on the same CPU. This may cause apparent elapsed + * time to disappear, and the guest to stand still or run + * very slowly. + */ + if (vcpu->tsc_catchup) { + u64 tsc = compute_guest_tsc(v, kernel_ns); + if (tsc > tsc_timestamp) { + kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); + tsc_timestamp = tsc; + } } - put_cpu_var(cpu_tsc_khz); - /* Keep irq disabled to prevent changes to the clock */ - local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); - ktime_get_ts(&ts); - monotonic_to_bootbased(&ts); local_irq_restore(flags); - /* With all the info we got, fill in the values */ + if (!vcpu->time_page) + return 0; + + /* + * Time as measured by the TSC may go backwards when resetting the base + * tsc_timestamp. The reason for this is that the TSC resolution is + * higher than the resolution of the other clock scales. Thus, many + * possible measurments of the TSC correspond to one measurement of any + * other clock, and so a spread of values is possible. This is not a + * problem for the computation of the nanosecond clock; with TSC rates + * around 1GHZ, there can only be a few cycles which correspond to one + * nanosecond value, and any path through this code will inevitably + * take longer than that. However, with the kernel_ns value itself, + * the precision may be much lower, down to HZ granularity. If the + * first sampling of TSC against kernel_ns ends in the low part of the + * range, and the second in the high end of the range, we can get: + * + * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new + * + * As the sampling errors potentially range in the thousands of cycles, + * it is possible such a time value has already been observed by the + * guest. To protect against this, we must compute the system time as + * observed by the guest and ensure the new system time is greater. + */ + max_kernel_ns = 0; + if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { + max_kernel_ns = vcpu->last_guest_tsc - + vcpu->hv_clock.tsc_timestamp; + max_kernel_ns = pvclock_scale_delta(max_kernel_ns, + vcpu->hv_clock.tsc_to_system_mul, + vcpu->hv_clock.tsc_shift); + max_kernel_ns += vcpu->last_kernel_ns; + } + + if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { + kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, + &vcpu->hv_clock.tsc_shift, + &vcpu->hv_clock.tsc_to_system_mul); + vcpu->hw_tsc_khz = this_tsc_khz; + } - vcpu->hv_clock.system_time = ts.tv_nsec + - (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; + if (max_kernel_ns > kernel_ns) + kernel_ns = max_kernel_ns; + /* With all the info we got, fill in the values */ + vcpu->hv_clock.tsc_timestamp = tsc_timestamp; + vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + vcpu->last_kernel_ns = kernel_ns; + vcpu->last_guest_tsc = tsc_timestamp; vcpu->hv_clock.flags = 0; /* @@ -874,16 +1160,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) kunmap_atomic(shared_kaddr, KM_USER0); mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); -} - -static int kvm_request_guest_time_update(struct kvm_vcpu *v) -{ - struct kvm_vcpu_arch *vcpu = &v->arch; - - if (!vcpu->time_page) - return 0; - set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); - return 1; + return 0; } static bool msr_mtrr_valid(unsigned msr) @@ -1209,6 +1486,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) } vcpu->arch.time = data; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ if (!(data & 1)) @@ -1224,8 +1502,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_release_page_clean(vcpu->arch.time_page); vcpu->arch.time_page = NULL; } - - kvm_request_guest_time_update(vcpu); break; } case MSR_IA32_MCG_CTL: @@ -1262,6 +1538,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case MSR_K7_CLK_CTL: + /* + * Ignore all writes to this no longer documented MSR. + * Writes are only relevant for old K7 processors, + * all pre-dating SVM, but a recommended workaround from + * AMD for these chips. It is possible to speicify the + * affected processor models on the command line, hence + * the need to ignore the workaround. + */ + break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1454,6 +1740,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case 0xcd: /* fsb frequency */ data = 3; break; + /* + * MSR_EBC_FREQUENCY_ID + * Conservative value valid for even the basic CPU models. + * Models 0,1: 000 in bits 23:21 indicating a bus speed of + * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, + * and 266MHz for model 3, or 4. Set Core Clock + * Frequency to System Bus Frequency Ratio to 1 (bits + * 31:24) even though these are only valid for CPU + * models > 2, however guests may end up dividing or + * multiplying by zero otherwise. + */ + case MSR_EBC_FREQUENCY_ID: + data = 1 << 24; + break; case MSR_IA32_APICBASE: data = kvm_get_apic_base(vcpu); break; @@ -1487,6 +1787,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); + case MSR_K7_CLK_CTL: + /* + * Provide expected ramp-up count for K7. All other + * are set to zero, indicating minimum divisors for + * every field. + * + * This prevents guest kernels on AMD host with CPU + * type 6, model 8 and higher from exploding due to + * the rdmsr failing. + */ + data = 0x20000000; + break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1524,16 +1836,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, { int i, idx; - vcpu_load(vcpu); - idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; srcu_read_unlock(&vcpu->kvm->srcu, idx); - vcpu_put(vcpu); - return i; } @@ -1618,6 +1926,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: + case KVM_CAP_XSAVE: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1641,6 +1950,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; + case KVM_CAP_XCRS: + r = cpu_has_xsave; + break; default: r = 0; break; @@ -1717,22 +2029,51 @@ out: return r; } +static void wbinvd_ipi(void *garbage) +{ + wbinvd(); +} + +static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.iommu_domain && + !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); +} + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + /* Address WBINVD may be executed by guest */ + if (need_emulate_wbinvd(vcpu)) { + if (kvm_x86_ops->has_wbinvd_exit()) + cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); + else if (vcpu->cpu != -1 && vcpu->cpu != cpu) + smp_call_function_single(vcpu->cpu, + wbinvd_ipi, NULL, 1); + } + kvm_x86_ops->vcpu_load(vcpu, cpu); - if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { - unsigned long khz = cpufreq_quick_get(cpu); - if (!khz) - khz = tsc_khz; - per_cpu(cpu_tsc_khz, cpu) = khz; + if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { + /* Make sure TSC doesn't go backwards */ + s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : + native_read_tsc() - vcpu->arch.last_host_tsc; + if (tsc_delta < 0) + mark_tsc_unstable("KVM discovered backwards TSC"); + if (check_tsc_unstable()) { + kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); + vcpu->arch.tsc_catchup = 1; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + if (vcpu->cpu != cpu) + kvm_migrate_timers(vcpu); + vcpu->cpu = cpu; } - kvm_request_guest_time_update(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_put_guest_fpu(vcpu); kvm_x86_ops->vcpu_put(vcpu); + kvm_put_guest_fpu(vcpu); + vcpu->arch.last_host_tsc = native_read_tsc(); } static int is_efer_nx(void) @@ -1781,7 +2122,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, if (copy_from_user(cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out_free; - vcpu_load(vcpu); for (i = 0; i < cpuid->nent; i++) { vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; @@ -1799,7 +2139,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, r = 0; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - vcpu_put(vcpu); + update_cpuid(vcpu); out_free: vfree(cpuid_entries); @@ -1820,11 +2160,10 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, if (copy_from_user(&vcpu->arch.cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; - vcpu_load(vcpu); vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - vcpu_put(vcpu); + update_cpuid(vcpu); return 0; out: @@ -1837,7 +2176,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, { int r; - vcpu_load(vcpu); r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; @@ -1849,7 +2187,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, out: cpuid->nent = vcpu->arch.cpuid_nent; - vcpu_put(vcpu); return r; } @@ -1901,19 +2238,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = - F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | + F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved, XSAVE, OSXSAVE */; + 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | + F(F16C); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | + F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | - 0 /* SKINIT */ | 0 /* WDT */; + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -1922,7 +2260,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xb); + entry->eax = min(entry->eax, (u32)0xd); break; case 1: entry->edx &= kvm_supported_word0_x86_features; @@ -1980,6 +2318,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 0xd: { + int i; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (i = 1; *nent < maxnent; ++i) { + if (entry[i - 1].eax == 0 && i != 2) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } case KVM_CPUID_SIGNATURE: { char signature[12] = "KVMKVMKVM\0\0"; u32 *sigptr = (u32 *)signature; @@ -2081,9 +2433,7 @@ out: static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - vcpu_load(vcpu); memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - vcpu_put(vcpu); return 0; } @@ -2091,11 +2441,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - vcpu_load(vcpu); memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); kvm_apic_post_state_restore(vcpu); update_cr8_intercept(vcpu); - vcpu_put(vcpu); return 0; } @@ -2107,20 +2455,16 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -EINVAL; if (irqchip_in_kernel(vcpu->kvm)) return -ENXIO; - vcpu_load(vcpu); kvm_queue_interrupt(vcpu, irq->irq, false); - - vcpu_put(vcpu); + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); kvm_inject_nmi(vcpu); - vcpu_put(vcpu); return 0; } @@ -2140,7 +2484,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, int r; unsigned bank_num = mcg_cap & 0xff, bank; - vcpu_load(vcpu); r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; @@ -2155,7 +2498,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; out: - vcpu_put(vcpu); return r; } @@ -2188,7 +2530,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, printk(KERN_DEBUG "kvm: set_mce: " "injects mce exception while " "previous one is in progress!\n"); - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return 0; } if (banks[1] & MCI_STATUS_VAL) @@ -2213,8 +2555,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - vcpu_load(vcpu); - events->exception.injected = vcpu->arch.exception.pending && !kvm_exception_is_soft(vcpu->arch.exception.nr); @@ -2239,8 +2579,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW); - - vcpu_put(vcpu); } static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, @@ -2251,8 +2589,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SHADOW)) return -EINVAL; - vcpu_load(vcpu); - vcpu->arch.exception.pending = events->exception.injected; vcpu->arch.exception.nr = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; @@ -2275,7 +2611,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) vcpu->arch.sipi_vector = events->sipi_vector; - vcpu_put(vcpu); + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -2283,14 +2619,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { - vcpu_load(vcpu); - memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); dbgregs->dr6 = vcpu->arch.dr6; dbgregs->dr7 = vcpu->arch.dr7; dbgregs->flags = 0; - - vcpu_put(vcpu); } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, @@ -2299,40 +2631,113 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, if (dbgregs->flags) return -EINVAL; - vcpu_load(vcpu); - memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = dbgregs->dr6; vcpu->arch.dr7 = dbgregs->dr7; - vcpu_put(vcpu); + return 0; +} +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + if (cpu_has_xsave) + memcpy(guest_xsave->region, + &vcpu->arch.guest_fpu.state->xsave, + xstate_size); + else { + memcpy(guest_xsave->region, + &vcpu->arch.guest_fpu.state->fxsave, + sizeof(struct i387_fxsave_struct)); + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = + XSTATE_FPSSE; + } +} + +static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + u64 xstate_bv = + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; + + if (cpu_has_xsave) + memcpy(&vcpu->arch.guest_fpu.state->xsave, + guest_xsave->region, xstate_size); + else { + if (xstate_bv & ~XSTATE_FPSSE) + return -EINVAL; + memcpy(&vcpu->arch.guest_fpu.state->fxsave, + guest_xsave->region, sizeof(struct i387_fxsave_struct)); + } return 0; } +static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) +{ + if (!cpu_has_xsave) { + guest_xcrs->nr_xcrs = 0; + return; + } + + guest_xcrs->nr_xcrs = 1; + guest_xcrs->flags = 0; + guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; + guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; +} + +static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) +{ + int i, r = 0; + + if (!cpu_has_xsave) + return -EINVAL; + + if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) + return -EINVAL; + + for (i = 0; i < guest_xcrs->nr_xcrs; i++) + /* Only support XCR0 currently */ + if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { + r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, + guest_xcrs->xcrs[0].value); + break; + } + if (r) + r = -EINVAL; + return r; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r; - struct kvm_lapic_state *lapic = NULL; + union { + struct kvm_lapic_state *lapic; + struct kvm_xsave *xsave; + struct kvm_xcrs *xcrs; + void *buffer; + } u; + u.buffer = NULL; switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; if (!vcpu->arch.apic) goto out; - lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; - if (!lapic) + if (!u.lapic) goto out; - r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); + r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) + if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) goto out; r = 0; break; @@ -2341,14 +2746,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!vcpu->arch.apic) goto out; - lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; - if (!lapic) + if (!u.lapic) goto out; r = -EFAULT; - if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) + if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state))) goto out; - r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); + r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); if (r) goto out; r = 0; @@ -2464,9 +2869,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&mce, argp, sizeof mce)) goto out; - vcpu_load(vcpu); r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); - vcpu_put(vcpu); break; } case KVM_GET_VCPU_EVENTS: { @@ -2513,11 +2916,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); break; } + case KVM_GET_XSAVE: { + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + r = -ENOMEM; + if (!u.xsave) + break; + + kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); + + r = -EFAULT; + if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) + break; + r = 0; + break; + } + case KVM_SET_XSAVE: { + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + r = -ENOMEM; + if (!u.xsave) + break; + + r = -EFAULT; + if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) + break; + + r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); + break; + } + case KVM_GET_XCRS: { + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + r = -ENOMEM; + if (!u.xcrs) + break; + + kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); + + r = -EFAULT; + if (copy_to_user(argp, u.xcrs, + sizeof(struct kvm_xcrs))) + break; + r = 0; + break; + } + case KVM_SET_XCRS: { + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + r = -ENOMEM; + if (!u.xcrs) + break; + + r = -EFAULT; + if (copy_from_user(u.xcrs, argp, + sizeof(struct kvm_xcrs))) + break; + + r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); + break; + } default: r = -EINVAL; } out: - kfree(lapic); + kfree(u.buffer); return r; } @@ -2557,116 +3016,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) { - return kvm->arch.n_alloc_mmu_pages; -} - -gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) -{ - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; - - aliases = kvm_aliases(kvm); - - for (i = 0; i < aliases->naliases; ++i) { - alias = &aliases->aliases[i]; - if (alias->flags & KVM_ALIAS_INVALID) - continue; - if (gfn >= alias->base_gfn - && gfn < alias->base_gfn + alias->npages) - return alias->target_gfn + gfn - alias->base_gfn; - } - return gfn; -} - -gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) -{ - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; - - aliases = kvm_aliases(kvm); - - for (i = 0; i < aliases->naliases; ++i) { - alias = &aliases->aliases[i]; - if (gfn >= alias->base_gfn - && gfn < alias->base_gfn + alias->npages) - return alias->target_gfn + gfn - alias->base_gfn; - } - return gfn; -} - -/* - * Set a new alias region. Aliases map a portion of physical memory into - * another portion. This is useful for memory windows, for example the PC - * VGA region. - */ -static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, - struct kvm_memory_alias *alias) -{ - int r, n; - struct kvm_mem_alias *p; - struct kvm_mem_aliases *aliases, *old_aliases; - - r = -EINVAL; - /* General sanity checks */ - if (alias->memory_size & (PAGE_SIZE - 1)) - goto out; - if (alias->guest_phys_addr & (PAGE_SIZE - 1)) - goto out; - if (alias->slot >= KVM_ALIAS_SLOTS) - goto out; - if (alias->guest_phys_addr + alias->memory_size - < alias->guest_phys_addr) - goto out; - if (alias->target_phys_addr + alias->memory_size - < alias->target_phys_addr) - goto out; - - r = -ENOMEM; - aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!aliases) - goto out; - - mutex_lock(&kvm->slots_lock); - - /* invalidate any gfn reference in case of deletion/shrinking */ - memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); - aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; - old_aliases = kvm->arch.aliases; - rcu_assign_pointer(kvm->arch.aliases, aliases); - synchronize_srcu_expedited(&kvm->srcu); - kvm_mmu_zap_all(kvm); - kfree(old_aliases); - - r = -ENOMEM; - aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!aliases) - goto out_unlock; - - memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); - - p = &aliases->aliases[alias->slot]; - p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; - p->npages = alias->memory_size >> PAGE_SHIFT; - p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; - p->flags &= ~(KVM_ALIAS_INVALID); - - for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (aliases->aliases[n - 1].npages) - break; - aliases->naliases = n; - - old_aliases = kvm->arch.aliases; - rcu_assign_pointer(kvm->arch.aliases, aliases); - synchronize_srcu_expedited(&kvm->srcu); - kfree(old_aliases); - r = 0; - -out_unlock: - mutex_unlock(&kvm->slots_lock); -out: - return r; + return kvm->arch.n_max_mmu_pages; } static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) @@ -2702,18 +3052,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - raw_spin_lock(&pic_irqchip(kvm)->lock); + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - raw_spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - raw_spin_lock(&pic_irqchip(kvm)->lock); + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - raw_spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); @@ -2797,7 +3147,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; unsigned long n; unsigned long is_dirty = 0; - unsigned long *dirty_bitmap = NULL; mutex_lock(&kvm->slots_lock); @@ -2812,27 +3161,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, n = kvm_dirty_bitmap_bytes(memslot); - r = -ENOMEM; - dirty_bitmap = vmalloc(n); - if (!dirty_bitmap) - goto out; - memset(dirty_bitmap, 0, n); - for (i = 0; !is_dirty && i < n/sizeof(long); i++) is_dirty = memslot->dirty_bitmap[i]; /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { struct kvm_memslots *slots, *old_slots; + unsigned long *dirty_bitmap; spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); - if (!slots) - goto out_free; + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) + goto out; + memset(dirty_bitmap, 0, n); + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) { + vfree(dirty_bitmap); + goto out; + } memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; @@ -2841,13 +3193,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, synchronize_srcu_expedited(&kvm->srcu); dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; kfree(old_slots); + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { + vfree(dirty_bitmap); + goto out; + } + vfree(dirty_bitmap); + } else { + r = -EFAULT; + if (clear_user(log->dirty_bitmap, n)) + goto out; } r = 0; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) - r = -EFAULT; -out_free: - vfree(dirty_bitmap); out: mutex_unlock(&kvm->slots_lock); return r; @@ -2867,7 +3226,6 @@ long kvm_arch_vm_ioctl(struct file *filp, union { struct kvm_pit_state ps; struct kvm_pit_state2 ps2; - struct kvm_memory_alias alias; struct kvm_pit_config pit_config; } u; @@ -2888,22 +3246,6 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; break; } - case KVM_SET_MEMORY_REGION: { - struct kvm_memory_region kvm_mem; - struct kvm_userspace_memory_region kvm_userspace_mem; - - r = -EFAULT; - if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) - goto out; - kvm_userspace_mem.slot = kvm_mem.slot; - kvm_userspace_mem.flags = kvm_mem.flags; - kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; - kvm_userspace_mem.memory_size = kvm_mem.memory_size; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); - if (r) - goto out; - break; - } case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); if (r) @@ -2912,14 +3254,6 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; - case KVM_SET_MEMORY_ALIAS: - r = -EFAULT; - if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) - goto out; - r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); - if (r) - goto out; - break; case KVM_CREATE_IRQCHIP: { struct kvm_pic *vpic; @@ -3123,7 +3457,6 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } case KVM_SET_CLOCK: { - struct timespec now; struct kvm_clock_data user_ns; u64 now_ns; s64 delta; @@ -3137,20 +3470,21 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; - ktime_get_ts(&now); - now_ns = timespec_to_ns(&now); + local_irq_disable(); + now_ns = get_kernel_ns(); delta = user_ns.clock - now_ns; + local_irq_enable(); kvm->arch.kvmclock_offset = delta; break; } case KVM_GET_CLOCK: { - struct timespec now; struct kvm_clock_data user_ns; u64 now_ns; - ktime_get_ts(&now); - now_ns = timespec_to_ns(&now); + local_irq_disable(); + now_ns = get_kernel_ns(); user_ns.clock = kvm->arch.kvmclock_offset + now_ns; + local_irq_enable(); user_ns.flags = 0; r = -EFAULT; @@ -3214,30 +3548,51 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, kvm_x86_ops->get_segment(vcpu, var, seg); } +static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +{ + return gpa; +} + +static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +{ + gpa_t t_gpa; + u32 error; + + BUG_ON(!mmu_is_nested(vcpu)); + + /* NPT walks are always user-walks */ + access |= PFERR_USER_MASK; + t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); + if (t_gpa == UNMAPPED_GVA) + vcpu->arch.fault.nested = true; + + return t_gpa; +} + gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error); } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, @@ -3248,7 +3603,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, + error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -3259,7 +3615,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, } ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); if (ret < 0) { - r = X86EMUL_UNHANDLEABLE; + r = X86EMUL_IO_NEEDED; goto out; } @@ -3303,8 +3659,9 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, - PFERR_WRITE_MASK, error); + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, + PFERR_WRITE_MASK, + error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -3315,7 +3672,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, } ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); if (ret < 0) { - r = X86EMUL_UNHANDLEABLE; + r = X86EMUL_IO_NEEDED; goto out; } @@ -3330,10 +3687,10 @@ out: static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; - u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -3343,12 +3700,10 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, error_code); + if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3370,11 +3725,12 @@ mmio: trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 0; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; + vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; - return X86EMUL_UNHANDLEABLE; + return X86EMUL_IO_NEEDED; } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -3392,17 +3748,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, static int emulator_write_emulated_onepage(unsigned long addr, const void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; - u32 error_code; - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, error_code); + if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3420,10 +3774,11 @@ mmio: return X86EMUL_CONTINUE; vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 1; - memcpy(vcpu->mmio_data, val, bytes); + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; + vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; + memcpy(vcpu->run->mmio.data, val, bytes); return X86EMUL_CONTINUE; } @@ -3431,6 +3786,7 @@ mmio: int emulator_write_emulated(unsigned long addr, const void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { /* Crossing a page boundary? */ @@ -3438,16 +3794,17 @@ int emulator_write_emulated(unsigned long addr, int rc, now; now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, vcpu); + rc = emulator_write_emulated_onepage(addr, val, now, error_code, + vcpu); if (rc != X86EMUL_CONTINUE) return rc; addr += now; val += now; bytes -= now; } - return emulator_write_emulated_onepage(addr, val, bytes, vcpu); + return emulator_write_emulated_onepage(addr, val, bytes, error_code, + vcpu); } -EXPORT_SYMBOL_GPL(emulator_write_emulated); #define CMPXCHG_TYPE(t, ptr, old, new) \ (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) @@ -3463,6 +3820,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, const void *old, const void *new, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; @@ -3484,6 +3842,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr, goto emul_write; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (is_error_page(page)) { + kvm_release_page_clean(page); + goto emul_write; + } kaddr = kmap_atomic(page, KM_USER0); kaddr += offset_in_page(gpa); @@ -3516,7 +3878,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, emul_write: printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); - return emulator_write_emulated(addr, new, bytes, vcpu); + return emulator_write_emulated(addr, new, bytes, error_code, vcpu); } static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) @@ -3541,7 +3903,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val, if (vcpu->arch.pio.count) goto data_avail; - trace_kvm_pio(1, port, size, 1); + trace_kvm_pio(0, port, size, 1); vcpu->arch.pio.port = port; vcpu->arch.pio.in = 1; @@ -3569,7 +3931,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port, const void *val, unsigned int count, struct kvm_vcpu *vcpu) { - trace_kvm_pio(0, port, size, 1); + trace_kvm_pio(1, port, size, 1); vcpu->arch.pio.port = port; vcpu->arch.pio.in = 0; @@ -3604,42 +3966,38 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) return X86EMUL_CONTINUE; } -int emulate_clts(struct kvm_vcpu *vcpu) +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) { - kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); - kvm_x86_ops->fpu_activate(vcpu); + if (!need_emulate_wbinvd(vcpu)) + return X86EMUL_CONTINUE; + + if (kvm_x86_ops->has_wbinvd_exit()) { + smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, + wbinvd_ipi, NULL, 1); + cpumask_clear(vcpu->arch.wbinvd_dirty_mask); + } + wbinvd(); return X86EMUL_CONTINUE; } +EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) +int emulate_clts(struct kvm_vcpu *vcpu) { - return kvm_get_dr(ctxt->vcpu, dr, dest); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + kvm_x86_ops->fpu_activate(vcpu); + return X86EMUL_CONTINUE; } -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) +int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) { - unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - - return kvm_set_dr(ctxt->vcpu, dr, value & mask); + return _kvm_get_dr(vcpu, dr, dest); } -void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) +int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) { - u8 opcodes[4]; - unsigned long rip = kvm_rip_read(vcpu); - unsigned long rip_linear; - if (!printk_ratelimit()) - return; - - rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); - - printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", - context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); + return __kvm_set_dr(vcpu, dr, value); } -EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static u64 mk_cr_64(u64 curr_cr, u32 new_val) { @@ -3674,27 +4032,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) return value; } -static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) +static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) { + int res = 0; + switch (cr) { case 0: - kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); + res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); break; case 2: vcpu->arch.cr2 = val; break; case 3: - kvm_set_cr3(vcpu, val); + res = kvm_set_cr3(vcpu, val); break; case 4: - kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); + res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: - kvm_set_cr8(vcpu, val & 0xfUL); + res = __kvm_set_cr8(vcpu, val & 0xfUL); break; default: vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + res = -1; } + + return res; } static int emulator_get_cpl(struct kvm_vcpu *vcpu) @@ -3707,6 +4070,17 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) kvm_x86_ops->get_gdt(vcpu, dt); } +static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->get_idt(vcpu, dt); +} + +static unsigned long emulator_get_cached_segment_base(int seg, + struct kvm_vcpu *vcpu) +{ + return get_segment_base(vcpu, seg); +} + static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, struct kvm_vcpu *vcpu) { @@ -3779,11 +4153,6 @@ static void emulator_set_segment_selector(u16 sel, int seg, kvm_set_segment(vcpu, &kvm_seg, seg); } -static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - kvm_x86_ops->set_rflags(vcpu, rflags); -} - static struct x86_emulate_ops emulate_ops = { .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, @@ -3797,11 +4166,16 @@ static struct x86_emulate_ops emulate_ops = { .set_cached_descriptor = emulator_set_cached_descriptor, .get_segment_selector = emulator_get_segment_selector, .set_segment_selector = emulator_set_segment_selector, + .get_cached_segment_base = emulator_get_cached_segment_base, .get_gdt = emulator_get_gdt, + .get_idt = emulator_get_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, - .set_rflags = emulator_set_rflags, + .get_dr = emulator_get_dr, + .set_dr = emulator_set_dr, + .set_msr = kvm_set_msr, + .get_msr = kvm_get_msr, }; static void cache_all_regs(struct kvm_vcpu *vcpu) @@ -3812,14 +4186,126 @@ static void cache_all_regs(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = ~0; } +static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) +{ + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); + /* + * an sti; sti; sequence only disable interrupts for the first + * instruction. So, if the last instruction, be it emulated or + * not, left the system with the INT_STI flag enabled, it + * means that the last instruction is an sti. We should not + * leave the flag on in this case. The same goes for mov ss + */ + if (!(int_shadow & mask)) + kvm_x86_ops->set_interrupt_shadow(vcpu, mask); +} + +static void inject_emulated_exception(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + if (ctxt->exception == PF_VECTOR) + kvm_propagate_fault(vcpu); + else if (ctxt->error_code_valid) + kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); + else + kvm_queue_exception(vcpu, ctxt->exception); +} + +static void init_emulate_ctxt(struct kvm_vcpu *vcpu) +{ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int cs_db, cs_l; + + cache_all_regs(vcpu); + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); + vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_VM86 : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + memset(c, 0, sizeof(struct decode_cache)); + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); +} + +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) +{ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int ret; + + init_emulate_ctxt(vcpu); + + vcpu->arch.emulate_ctxt.decode.op_bytes = 2; + vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; + vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; + ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); + + if (ret != X86EMUL_CONTINUE) + return EMULATE_FAIL; + + vcpu->arch.emulate_ctxt.eip = c->eip; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + + if (irq == NMI_VECTOR) + vcpu->arch.nmi_pending = false; + else + vcpu->arch.interrupt.pending = false; + + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); + +static int handle_emulation_failure(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.insn_emulation_fail; + trace_kvm_emulate_insn_failed(vcpu); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + kvm_queue_exception(vcpu, UD_VECTOR); + return EMULATE_FAIL; +} + +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa; + + if (tdp_enabled) + return false; + + /* + * if emulation was due to access to shadowed page table + * and it failed try to unshadow page and re-entetr the + * guest to let CPU execute the instruction. + */ + if (kvm_mmu_unprotect_page_virt(vcpu, gva)) + return true; + + gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); + + if (gpa == UNMAPPED_GVA) + return true; /* let cpu generate fault */ + + if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) + return true; + + return false; +} + int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, int emulation_type) { - int r, shadow_mask; - struct decode_cache *c; - struct kvm_run *run = vcpu->run; + int r; + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; @@ -3831,28 +4317,20 @@ int emulate_instruction(struct kvm_vcpu *vcpu, */ cache_all_regs(vcpu); - vcpu->mmio_is_write = 0; - if (!(emulation_type & EMULTYPE_NO_DECODE)) { - int cs_db, cs_l; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + init_emulate_ctxt(vcpu); + vcpu->arch.emulate_ctxt.interruptibility = 0; + vcpu->arch.emulate_ctxt.exception = -1; + vcpu->arch.emulate_ctxt.perm_ok = false; + + r = x86_decode_insn(&vcpu->arch.emulate_ctxt); + if (r == X86EMUL_PROPAGATE_FAULT) + goto done; + trace_kvm_emulate_insn_start(vcpu); /* Only allow emulation of specific instructions on #UD * (namely VMMCALL, sysenter, sysexit, syscall)*/ - c = &vcpu->arch.emulate_ctxt.decode; if (emulation_type & EMULTYPE_TRAP_UD) { if (!c->twobyte) return EMULATE_FAIL; @@ -3880,11 +4358,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ++vcpu->stat.insn_emulation; if (r) { - ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; - return EMULATE_FAIL; + if (emulation_type & EMULTYPE_SKIP) + return EMULATE_FAIL; + return handle_emulation_failure(vcpu); } } @@ -3893,52 +4371,44 @@ int emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } -restart: - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; - - if (r == 0) - kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + /* this is needed for vmware backdor interface to work since it + changes registers values during IO operation */ + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); - if (vcpu->arch.pio.count) { - if (!vcpu->arch.pio.in) - vcpu->arch.pio.count = 0; - return EMULATE_DO_MMIO; - } +restart: + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); - if (r || vcpu->mmio_is_write) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr; - memcpy(run->mmio.data, vcpu->mmio_data, 8); - run->mmio.len = vcpu->mmio_size; - run->mmio.is_write = vcpu->mmio_is_write; - } + if (r == EMULATION_FAILED) { + if (reexecute_instruction(vcpu, cr2)) + return EMULATE_DONE; - if (r) { - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - goto done; - if (!vcpu->mmio_needed) { - ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); - kvm_report_emulation_failure(vcpu, "mmio"); - return EMULATE_FAIL; - } - return EMULATE_DO_MMIO; - } - - if (vcpu->mmio_is_write) { - vcpu->mmio_needed = 0; - return EMULATE_DO_MMIO; + return handle_emulation_failure(vcpu); } done: - if (vcpu->arch.exception.pending) - vcpu->arch.emulate_ctxt.restart = false; - - if (vcpu->arch.emulate_ctxt.restart) + if (vcpu->arch.emulate_ctxt.exception >= 0) { + inject_emulated_exception(vcpu); + r = EMULATE_DONE; + } else if (vcpu->arch.pio.count) { + if (!vcpu->arch.pio.in) + vcpu->arch.pio.count = 0; + r = EMULATE_DO_MMIO; + } else if (vcpu->mmio_needed) { + if (vcpu->mmio_is_write) + vcpu->mmio_needed = 0; + r = EMULATE_DO_MMIO; + } else if (r == EMULATION_RESTART) goto restart; + else + r = EMULATE_DONE; - return EMULATE_DONE; + toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + + return r; } EXPORT_SYMBOL_GPL(emulate_instruction); @@ -3952,9 +4422,23 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) } EXPORT_SYMBOL_GPL(kvm_fast_pio_out); -static void bounce_off(void *info) +static void tsc_bad(void *info) { - /* nothing */ + __get_cpu_var(cpu_tsc_khz) = 0; +} + +static void tsc_khz_changed(void *data) +{ + struct cpufreq_freqs *freq = data; + unsigned long khz = 0; + + if (data) + khz = freq->new; + else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + khz = cpufreq_quick_get(raw_smp_processor_id()); + if (!khz) + khz = tsc_khz; + __get_cpu_var(cpu_tsc_khz) = khz; } static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, @@ -3965,21 +4449,60 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va struct kvm_vcpu *vcpu; int i, send_ipi = 0; + /* + * We allow guests to temporarily run on slowing clocks, + * provided we notify them after, or to run on accelerating + * clocks, provided we notify them before. Thus time never + * goes backwards. + * + * However, we have a problem. We can't atomically update + * the frequency of a given CPU from this function; it is + * merely a notifier, which can be called from any CPU. + * Changing the TSC frequency at arbitrary points in time + * requires a recomputation of local variables related to + * the TSC for each VCPU. We must flag these local variables + * to be updated and be sure the update takes place with the + * new frequency before any guests proceed. + * + * Unfortunately, the combination of hotplug CPU and frequency + * change creates an intractable locking scenario; the order + * of when these callouts happen is undefined with respect to + * CPU hotplug, and they can race with each other. As such, + * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is + * undefined; you can actually have a CPU frequency change take + * place in between the computation of X and the setting of the + * variable. To protect against this problem, all updates of + * the per_cpu tsc_khz variable are done in an interrupt + * protected IPI, and all callers wishing to update the value + * must wait for a synchronous IPI to complete (which is trivial + * if the caller is on the CPU already). This establishes the + * necessary total order on variable updates. + * + * Note that because a guest time update may take place + * anytime after the setting of the VCPU's request bit, the + * correct TSC value must be set before the request. However, + * to ensure the update actually makes it to any guest which + * starts running in hardware virtualization between the set + * and the acquisition of the spinlock, we must also ping the + * CPU after setting the request bit. + * + */ + if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) return 0; if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) return 0; - per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; + + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) continue; - if (!kvm_request_guest_time_update(vcpu)) - continue; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != smp_processor_id()) - send_ipi++; + send_ipi = 1; } } spin_unlock(&kvm_lock); @@ -3997,32 +4520,57 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va * guest context is entered kvmclock will be updated, * so the guest will not see stale values. */ - smp_call_function_single(freq->cpu, bounce_off, NULL, 1); + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); } return 0; } static struct notifier_block kvmclock_cpufreq_notifier_block = { - .notifier_call = kvmclock_cpufreq_notifier + .notifier_call = kvmclock_cpufreq_notifier +}; + +static int kvmclock_cpu_notifier(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, tsc_bad, NULL, 1); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block kvmclock_cpu_notifier_block = { + .notifier_call = kvmclock_cpu_notifier, + .priority = -INT_MAX }; static void kvm_timer_init(void) { int cpu; + max_tsc_khz = tsc_khz; + register_hotcpu_notifier(&kvmclock_cpu_notifier_block); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { +#ifdef CONFIG_CPU_FREQ + struct cpufreq_policy policy; + memset(&policy, 0, sizeof(policy)); + cpufreq_get_policy(&policy, get_cpu()); + if (policy.cpuinfo.max_freq) + max_tsc_khz = policy.cpuinfo.max_freq; +#endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - for_each_online_cpu(cpu) { - unsigned long khz = cpufreq_get(cpu); - if (!khz) - khz = tsc_khz; - per_cpu(cpu_tsc_khz, cpu) = khz; - } - } else { - for_each_possible_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; } + pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); + for_each_online_cpu(cpu) + smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); } static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); @@ -4108,6 +4656,9 @@ int kvm_arch_init(void *opaque) perf_register_guest_info_callbacks(&kvm_guest_cbs); + if (cpu_has_xsave) + host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + return 0; out: @@ -4121,6 +4672,7 @@ void kvm_arch_exit(void) if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); + unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); kvm_x86_ops = NULL; kvm_mmu_module_exit(); } @@ -4270,7 +4822,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(rip, instruction, 3, vcpu); + return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); } void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) @@ -4506,78 +5058,103 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) } } +static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && + !vcpu->guest_xcr0_loaded) { + /* kvm_set_xcr() also depends on this */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + vcpu->guest_xcr0_loaded = 1; + } +} + +static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_xcr0_loaded) { + if (vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); + vcpu->guest_xcr0_loaded = 0; + } +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; - if (vcpu->requests) - if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) - kvm_mmu_unload(vcpu); - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - if (vcpu->requests) { - if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) + kvm_mmu_unload(vcpu); + if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); - if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) - kvm_write_guest_time(vcpu); - if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { + r = kvm_guest_time_update(vcpu); + if (unlikely(r)) + goto out; + } + if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_x86_ops->tlb_flush(vcpu); - if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, - &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; goto out; } - if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; r = 0; goto out; } - if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { vcpu->fpu_active = 0; kvm_x86_ops->fpu_deactivate(vcpu); } } + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + goto out; + + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + inject_pending_event(vcpu); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } + } + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); if (vcpu->fpu_active) kvm_load_guest_fpu(vcpu); + kvm_load_guest_xcr0(vcpu); - local_irq_disable(); + atomic_set(&vcpu->guest_mode, 1); + smp_wmb(); - clear_bit(KVM_REQ_KICK, &vcpu->requests); - smp_mb__after_clear_bit(); + local_irq_disable(); - if (vcpu->requests || need_resched() || signal_pending(current)) { - set_bit(KVM_REQ_KICK, &vcpu->requests); + if (!atomic_read(&vcpu->guest_mode) || vcpu->requests + || need_resched() || signal_pending(current)) { + atomic_set(&vcpu->guest_mode, 0); + smp_wmb(); local_irq_enable(); preempt_enable(); + kvm_x86_ops->cancel_injection(vcpu); r = 1; goto out; } - inject_pending_event(vcpu); - - /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); - - if (kvm_lapic_enabled(vcpu)) { - update_cr8_intercept(vcpu); - kvm_lapic_sync_to_vapic(vcpu); - } - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); @@ -4603,7 +5180,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - set_bit(KVM_REQ_KICK, &vcpu->requests); + kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + + atomic_set(&vcpu->guest_mode, 0); + smp_wmb(); local_irq_enable(); ++vcpu->stat.exits; @@ -4665,7 +5245,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: @@ -4717,8 +5297,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - vcpu_load(vcpu); - if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -4733,8 +5311,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (!irqchip_in_kernel(vcpu->kvm)) kvm_set_cr8(vcpu, kvm_run->cr8); - if (vcpu->arch.pio.count || vcpu->mmio_needed || - vcpu->arch.emulate_ctxt.restart) { + if (vcpu->arch.pio.count || vcpu->mmio_needed) { if (vcpu->mmio_needed) { memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); vcpu->mmio_read_completed = 1; @@ -4743,7 +5320,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r == EMULATE_DO_MMIO) { + if (r != EMULATE_DONE) { r = 0; goto out; } @@ -4759,14 +5336,11 @@ out: if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); - vcpu_put(vcpu); return r; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - vcpu_load(vcpu); - regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -4789,15 +5363,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->rip = kvm_rip_read(vcpu); regs->rflags = kvm_get_rflags(vcpu); - vcpu_put(vcpu); - return 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - vcpu_load(vcpu); - kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); @@ -4822,7 +5392,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.exception.pending = false; - vcpu_put(vcpu); + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -4842,8 +5412,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, { struct desc_ptr dt; - vcpu_load(vcpu); - kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -4875,55 +5443,43 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, set_bit(vcpu->arch.interrupt.nr, (unsigned long *)sregs->interrupt_bitmap); - vcpu_put(vcpu); - return 0; } int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu_load(vcpu); mp_state->mp_state = vcpu->arch.mp_state; - vcpu_put(vcpu); return 0; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu_load(vcpu); vcpu->arch.mp_state = mp_state->mp_state; - vcpu_put(vcpu); + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { - int cs_db, cs_l, ret; - cache_all_regs(vcpu); - - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int ret; - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + init_emulate_ctxt(vcpu); - ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, + ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, tss_selector, reason, has_error_code, error_code); if (ret) return EMULATE_FAIL; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); return EMULATE_DONE; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -4935,8 +5491,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int pending_vec, max_bits; struct desc_ptr dt; - vcpu_load(vcpu); - dt.size = sregs->idt.limit; dt.address = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); @@ -4961,7 +5515,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.cr3); + load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); mmu_reset_needed = 1; } @@ -4996,7 +5550,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - vcpu_put(vcpu); + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -5007,12 +5561,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, unsigned long rflags; int i, r; - vcpu_load(vcpu); - if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { r = -EBUSY; if (vcpu->arch.exception.pending) - goto unlock_out; + goto out; if (dbg->control & KVM_GUESTDBG_INJECT_DB) kvm_queue_exception(vcpu, DB_VECTOR); else @@ -5054,34 +5606,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, r = 0; -unlock_out: - vcpu_put(vcpu); +out: return r; } /* - * fxsave fpu state. Taken from x86_64/processor.h. To be killed when - * we have asm/x86/processor.h - */ -struct fxsave { - u16 cwd; - u16 swd; - u16 twd; - u16 fop; - u64 rip; - u64 rdp; - u32 mxcsr; - u32 mxcsr_mask; - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ -#ifdef CONFIG_X86_64 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ -#else - u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ -#endif -}; - -/* * Translate a guest virtual address to a guest physical address. */ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, @@ -5091,7 +5621,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa_t gpa; int idx; - vcpu_load(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -5099,16 +5628,14 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; tr->usermode = 0; - vcpu_put(vcpu); return 0; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - - vcpu_load(vcpu); + struct i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; @@ -5119,16 +5646,13 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fpu->last_dp = fxsave->rdp; memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); - vcpu_put(vcpu); - return 0; } int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - - vcpu_load(vcpu); + struct i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -5139,61 +5663,63 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fxsave->rdp = fpu->last_dp; memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); - vcpu_put(vcpu); - return 0; } -void fx_init(struct kvm_vcpu *vcpu) +int fx_init(struct kvm_vcpu *vcpu) { - unsigned after_mxcsr_mask; + int err; + + err = fpu_alloc(&vcpu->arch.guest_fpu); + if (err) + return err; + + fpu_finit(&vcpu->arch.guest_fpu); /* - * Touch the fpu the first time in non atomic context as if - * this is the first fpu instruction the exception handler - * will fire before the instruction returns and it'll have to - * allocate ram with GFP_KERNEL. + * Ensure guest xcr0 is valid for loading */ - if (!used_math()) - kvm_fx_save(&vcpu->arch.host_fx_image); - - /* Initialize guest FPU by resetting ours and saving into guest's */ - preempt_disable(); - kvm_fx_save(&vcpu->arch.host_fx_image); - kvm_fx_finit(); - kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); - preempt_enable(); + vcpu->arch.xcr0 = XSTATE_FP; vcpu->arch.cr0 |= X86_CR0_ET; - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); - vcpu->arch.guest_fx_image.mxcsr = 0x1f80; - memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, - 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); + + return 0; } EXPORT_SYMBOL_GPL(fx_init); +static void fx_free(struct kvm_vcpu *vcpu) +{ + fpu_free(&vcpu->arch.guest_fpu); +} + void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { if (vcpu->guest_fpu_loaded) return; + /* + * Restore all possible states in the guest, + * and assume host would use all available bits. + * Guest xcr0 would be loaded later. + */ + kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; - kvm_fx_save(&vcpu->arch.host_fx_image); - kvm_fx_restore(&vcpu->arch.guest_fx_image); + unlazy_fpu(current); + fpu_restore_checking(&vcpu->arch.guest_fpu); trace_kvm_fpu(1); } void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { + kvm_put_guest_xcr0(vcpu); + if (!vcpu->guest_fpu_loaded) return; vcpu->guest_fpu_loaded = 0; - kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); + fpu_save_init(&vcpu->arch.guest_fpu); ++vcpu->stat.fpu_reload; - set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); trace_kvm_fpu(0); } @@ -5204,12 +5730,18 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) vcpu->arch.time_page = NULL; } + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { + if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + printk_once(KERN_WARNING + "kvm: SMP vm created on host with unstable TSC; " + "guest TSC will not be reliable\n"); return kvm_x86_ops->vcpu_create(kvm, id); } @@ -5217,9 +5749,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { int r; - /* We do fxsave: this must be aligned. */ - BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); - vcpu->arch.mtrr_state.have_fixed = 1; vcpu_load(vcpu); r = kvm_arch_vcpu_reset(vcpu); @@ -5241,6 +5770,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); vcpu_put(vcpu); + fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -5254,22 +5784,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.dr6 = DR6_FIXED_1; vcpu->arch.dr7 = DR7_FIXED_1; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return kvm_x86_ops->vcpu_reset(vcpu); } int kvm_arch_hardware_enable(void *garbage) { - /* - * Since this may be called from a hotplug notifcation, - * we can't get the CPU frequency directly. - */ - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - int cpu = raw_smp_processor_id(); - per_cpu(cpu_tsc_khz, cpu) = 0; - } + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; kvm_shared_msr_cpu_online(); - + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + if (vcpu->cpu == smp_processor_id()) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); return kvm_x86_ops->hardware_enable(garbage); } @@ -5303,7 +5833,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; + vcpu->arch.emulate_ctxt.ops = &emulate_ops; + vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; + vcpu->arch.mmu.translate_gpa = translate_gpa; + vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else @@ -5316,6 +5850,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); + if (!kvm->arch.virtual_tsc_khz) + kvm_arch_set_tsc_khz(kvm, max_tsc_khz); + r = kvm_mmu_create(vcpu); if (r < 0) goto fail_free_pio_data; @@ -5334,7 +5871,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) + goto fail_free_mce_banks; + return 0; +fail_free_mce_banks: + kfree(vcpu->arch.mce_banks); fail_free_lapic: kvm_free_lapic(vcpu); fail_mmu_destroy: @@ -5364,19 +5906,13 @@ struct kvm *kvm_arch_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); - kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!kvm->arch.aliases) { - kfree(kvm); - return ERR_PTR(-ENOMEM); - } - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - rdtscll(kvm->arch.vm_init_tsc); + spin_lock_init(&kvm->arch.tsc_write_lock); return kvm; } @@ -5412,12 +5948,12 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { kvm_free_all_assigned_devices(kvm); + kvm_free_pit(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_iommu_unmap_guest(kvm); - kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); @@ -5427,7 +5963,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); cleanup_srcu_struct(&kvm->srcu); - kfree(kvm->arch.aliases); kfree(kvm); } @@ -5438,6 +5973,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, int user_alloc) { int npages = memslot->npages; + int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; + + /* Prevent internal slot pages from being moved by fork()/COW. */ + if (memslot->id >= KVM_MEMORY_SLOTS) + map_flags = MAP_SHARED | MAP_ANONYMOUS; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -5450,7 +5990,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, userspace_addr = do_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, + map_flags, 0); up_write(¤t->mm->mmap_sem); @@ -5523,7 +6063,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) + if (atomic_xchg(&vcpu->guest_mode, 0)) smp_send_reschedule(cpu); put_cpu(); } @@ -5559,6 +6099,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; kvm_x86_ops->set_rflags(vcpu, rflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_set_rflags); |