summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/vmx/nested.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx/nested.c')
-rw-r--r--arch/x86/kvm/vmx/nested.c778
1 files changed, 489 insertions, 289 deletions
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 46af3a5e9209..ced9fba32598 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -41,15 +41,19 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
-static u16 shadow_read_only_fields[] = {
-#define SHADOW_FIELD_RO(x) x,
+struct shadow_vmcs_field {
+ u16 encoding;
+ u16 offset;
+};
+static struct shadow_vmcs_field shadow_read_only_fields[] = {
+#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
ARRAY_SIZE(shadow_read_only_fields);
-static u16 shadow_read_write_fields[] = {
-#define SHADOW_FIELD_RW(x) x,
+static struct shadow_vmcs_field shadow_read_write_fields[] = {
+#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
@@ -63,34 +67,40 @@ static void init_vmcs_shadow_fields(void)
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
for (i = j = 0; i < max_shadow_read_only_fields; i++) {
- u16 field = shadow_read_only_fields[i];
+ struct shadow_vmcs_field entry = shadow_read_only_fields[i];
+ u16 field = entry.encoding;
if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
(i + 1 == max_shadow_read_only_fields ||
- shadow_read_only_fields[i + 1] != field + 1))
+ shadow_read_only_fields[i + 1].encoding != field + 1))
pr_err("Missing field from shadow_read_only_field %x\n",
field + 1);
clear_bit(field, vmx_vmread_bitmap);
-#ifdef CONFIG_X86_64
if (field & 1)
+#ifdef CONFIG_X86_64
continue;
+#else
+ entry.offset += sizeof(u32);
#endif
- if (j < i)
- shadow_read_only_fields[j] = field;
- j++;
+ shadow_read_only_fields[j++] = entry;
}
max_shadow_read_only_fields = j;
for (i = j = 0; i < max_shadow_read_write_fields; i++) {
- u16 field = shadow_read_write_fields[i];
+ struct shadow_vmcs_field entry = shadow_read_write_fields[i];
+ u16 field = entry.encoding;
if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
(i + 1 == max_shadow_read_write_fields ||
- shadow_read_write_fields[i + 1] != field + 1))
+ shadow_read_write_fields[i + 1].encoding != field + 1))
pr_err("Missing field from shadow_read_write_field %x\n",
field + 1);
+ WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
+ field <= GUEST_TR_AR_BYTES,
+ "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
+
/*
* PML and the preemption timer can be emulated, but the
* processor cannot vmwrite to fields that don't exist
@@ -115,13 +125,13 @@ static void init_vmcs_shadow_fields(void)
clear_bit(field, vmx_vmwrite_bitmap);
clear_bit(field, vmx_vmread_bitmap);
-#ifdef CONFIG_X86_64
if (field & 1)
+#ifdef CONFIG_X86_64
continue;
+#else
+ entry.offset += sizeof(u32);
#endif
- if (j < i)
- shadow_read_write_fields[j] = field;
- j++;
+ shadow_read_write_fields[j++] = entry;
}
max_shadow_read_write_fields = j;
}
@@ -182,8 +192,9 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
- vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER, -1ull);
+ vmx->nested.need_vmcs12_to_shadow_sync = false;
}
static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
@@ -209,6 +220,8 @@ static void free_nested(struct kvm_vcpu *vcpu)
if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
return;
+ kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+
vmx->nested.vmxon = false;
vmx->nested.smm.vmxon = false;
free_vpid(vmx->nested.vpid02);
@@ -221,7 +234,9 @@ static void free_nested(struct kvm_vcpu *vcpu)
vmx->vmcs01.shadow_vmcs = NULL;
}
kfree(vmx->nested.cached_vmcs12);
+ vmx->nested.cached_vmcs12 = NULL;
kfree(vmx->nested.cached_shadow_vmcs12);
+ vmx->nested.cached_shadow_vmcs12 = NULL;
/* Unpin physical memory we referred to in the vmcs02 */
if (vmx->nested.apic_access_page) {
kvm_release_page_dirty(vmx->nested.apic_access_page);
@@ -238,22 +253,41 @@ static void free_nested(struct kvm_vcpu *vcpu)
free_loaded_vmcs(&vmx->nested.vmcs02);
}
+static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
+ struct loaded_vmcs *prev)
+{
+ struct vmcs_host_state *dest, *src;
+
+ if (unlikely(!vmx->guest_state_loaded))
+ return;
+
+ src = &prev->host_state;
+ dest = &vmx->loaded_vmcs->host_state;
+
+ vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
+ dest->ldt_sel = src->ldt_sel;
+#ifdef CONFIG_X86_64
+ dest->ds_sel = src->ds_sel;
+ dest->es_sel = src->es_sel;
+#endif
+}
+
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct loaded_vmcs *prev;
int cpu;
if (vmx->loaded_vmcs == vmcs)
return;
cpu = get_cpu();
- vmx_vcpu_put(vcpu);
+ prev = vmx->loaded_vmcs;
vmx->loaded_vmcs = vmcs;
- vmx_vcpu_load(vcpu, cpu);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
+ vmx_sync_vmcs_host_state(vmx, prev);
put_cpu();
- vm_entry_controls_reset_shadow(vmx);
- vm_exit_controls_reset_shadow(vmx);
vmx_segment_cache_clear(vmx);
}
@@ -930,8 +964,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
* If PAE paging and EPT are both on, CR3 is not used by the CPU and
* must not be dereferenced.
*/
- if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) &&
- !nested_ept) {
+ if (is_pae_paging(vcpu) && !nested_ept) {
if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
*entry_failure_code = ENTRY_FAIL_PDPTE;
return -EINVAL;
@@ -1105,14 +1138,6 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
vmx->nested.msrs.misc_low = data;
vmx->nested.msrs.misc_high = data >> 32;
- /*
- * If L1 has read-only VM-exit information fields, use the
- * less permissive vmx_vmwrite_bitmap to specify write
- * permissions for the shadow VMCS.
- */
- if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
- vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
-
return 0;
}
@@ -1214,6 +1239,11 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
case MSR_IA32_VMX_VMCS_ENUM:
vmx->nested.msrs.vmcs_enum = data;
return 0;
+ case MSR_IA32_VMX_VMFUNC:
+ if (data & ~vmx->nested.msrs.vmfunc_controls)
+ return -EINVAL;
+ vmx->nested.msrs.vmfunc_controls = data;
+ return 0;
default:
/*
* The rest of the VMX capability MSRs do not support restore.
@@ -1301,41 +1331,32 @@ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
}
/*
- * Copy the writable VMCS shadow fields back to the VMCS12, in case
- * they have been modified by the L1 guest. Note that the "read-only"
- * VM-exit information fields are actually writable if the vCPU is
- * configured to support "VMWRITE to any supported field in the VMCS."
+ * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
+ * been modified by the L1 guest. Note, "writable" in this context means
+ * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
+ * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
+ * VM-exit information fields (which are actually writable if the vCPU is
+ * configured to support "VMWRITE to any supported field in the VMCS").
*/
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
- const u16 *fields[] = {
- shadow_read_write_fields,
- shadow_read_only_fields
- };
- const int max_fields[] = {
- max_shadow_read_write_fields,
- max_shadow_read_only_fields
- };
- int i, q;
- unsigned long field;
- u64 field_value;
struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+ struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
+ struct shadow_vmcs_field field;
+ unsigned long val;
+ int i;
+
+ if (WARN_ON(!shadow_vmcs))
+ return;
preempt_disable();
vmcs_load(shadow_vmcs);
- for (q = 0; q < ARRAY_SIZE(fields); q++) {
- for (i = 0; i < max_fields[q]; i++) {
- field = fields[q][i];
- field_value = __vmcs_readl(field);
- vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
- }
- /*
- * Skip the VM-exit information fields if they are read-only.
- */
- if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
- break;
+ for (i = 0; i < max_shadow_read_write_fields; i++) {
+ field = shadow_read_write_fields[i];
+ val = __vmcs_readl(field.encoding);
+ vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
}
vmcs_clear(shadow_vmcs);
@@ -1346,7 +1367,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
- const u16 *fields[] = {
+ const struct shadow_vmcs_field *fields[] = {
shadow_read_write_fields,
shadow_read_only_fields
};
@@ -1354,18 +1375,23 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
max_shadow_read_write_fields,
max_shadow_read_only_fields
};
- int i, q;
- unsigned long field;
- u64 field_value = 0;
struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+ struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
+ struct shadow_vmcs_field field;
+ unsigned long val;
+ int i, q;
+
+ if (WARN_ON(!shadow_vmcs))
+ return;
vmcs_load(shadow_vmcs);
for (q = 0; q < ARRAY_SIZE(fields); q++) {
for (i = 0; i < max_fields[q]; i++) {
field = fields[q][i];
- vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value);
- __vmcs_writel(field, field_value);
+ val = vmcs12_read_any(vmcs12, field.encoding,
+ field.offset);
+ __vmcs_writel(field.encoding, val);
}
}
@@ -1623,7 +1649,7 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
* evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
* evmcs->host_idtr_base = vmcs12->host_idtr_base;
* evmcs->host_rsp = vmcs12->host_rsp;
- * sync_vmcs12() doesn't read these:
+ * sync_vmcs02_to_vmcs12() doesn't read these:
* evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
* evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
* evmcs->msr_bitmap = vmcs12->msr_bitmap;
@@ -1768,26 +1794,22 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
bool from_launch)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct hv_vp_assist_page assist_page;
+ bool evmcs_gpa_changed = false;
+ u64 evmcs_gpa;
if (likely(!vmx->nested.enlightened_vmcs_enabled))
return 1;
- if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
+ if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
return 1;
- if (unlikely(!assist_page.enlighten_vmentry))
- return 1;
-
- if (unlikely(assist_page.current_nested_vmcs !=
- vmx->nested.hv_evmcs_vmptr)) {
-
+ if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
if (!vmx->nested.hv_evmcs)
vmx->nested.current_vmptr = -1ull;
nested_release_evmcs(vcpu);
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs),
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
&vmx->nested.hv_evmcs_map))
return 0;
@@ -1822,15 +1844,9 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
}
vmx->nested.dirty_vmcs12 = true;
- /*
- * As we keep L2 state for one guest only 'hv_clean_fields' mask
- * can't be used when we switch between them. Reset it here for
- * simplicity.
- */
- vmx->nested.hv_evmcs->hv_clean_fields &=
- ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
- vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs;
+ vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
+ evmcs_gpa_changed = true;
/*
* Unlike normal vmcs12, enlightened vmcs12 is not fully
* reloaded from guest's memory (read only fields, fields not
@@ -1844,10 +1860,19 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
}
}
+
+ /*
+ * Clean fields data can't de used on VMLAUNCH and when we switch
+ * between different L2 guests as KVM keeps a single VMCS12 per L1.
+ */
+ if (from_launch || evmcs_gpa_changed)
+ vmx->nested.hv_evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
return 1;
}
-void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu)
+void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1868,7 +1893,7 @@ void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu)
copy_vmcs12_to_shadow(vmx);
}
- vmx->nested.need_vmcs12_sync = false;
+ vmx->nested.need_vmcs12_to_shadow_sync = false;
}
static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
@@ -1948,8 +1973,20 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
- if (enable_pml)
+ /*
+ * The PML address never changes, so it is constant in vmcs02.
+ * Conceptually we want to copy the PML index from vmcs01 here,
+ * and then back to vmcs01 on nested vmexit. But since we flush
+ * the log and reset GUEST_PML_INDEX on each vmexit, the PML
+ * index is also effectively constant in vmcs02.
+ */
+ if (enable_pml) {
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ }
+
+ if (cpu_has_vmx_encls_vmexit())
+ vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
/*
* Set the MSR load/store lists to match L0's settings. Only the
@@ -1963,7 +2000,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
vmx_set_constant_host_state(vmx);
}
-static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
+static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
struct vmcs12 *vmcs12)
{
prepare_vmcs02_constant_state(vmx);
@@ -1984,17 +2021,14 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
- prepare_vmcs02_early_full(vmx, vmcs12);
+ prepare_vmcs02_early_rare(vmx, vmcs12);
/*
* PIN CONTROLS
*/
- exec_control = vmcs12->pin_based_vm_exec_control;
-
- /* Preemption timer setting is computed directly in vmx_vcpu_run. */
- exec_control |= vmcs_config.pin_based_exec_ctrl;
- exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
- vmx->loaded_vmcs->hv_timer_armed = false;
+ exec_control = vmx_pin_based_exec_ctrl(vmx);
+ exec_control |= (vmcs12->pin_based_vm_exec_control &
+ ~PIN_BASED_VMX_PREEMPTION_TIMER);
/* Posted interrupts setting is only taken from vmcs12. */
if (nested_cpu_has_posted_intr(vmcs12)) {
@@ -2003,7 +2037,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
} else {
exec_control &= ~PIN_BASED_POSTED_INTR;
}
- vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
+ pin_controls_set(vmx, exec_control);
/*
* EXEC CONTROLS
@@ -2014,28 +2048,31 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
exec_control &= ~CPU_BASED_TPR_SHADOW;
exec_control |= vmcs12->cpu_based_vm_exec_control;
- /*
- * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
- * nested_get_vmcs12_pages can't fix it up, the illegal value
- * will result in a VM entry failure.
- */
- if (exec_control & CPU_BASED_TPR_SHADOW) {
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+ if (exec_control & CPU_BASED_TPR_SHADOW)
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
- } else {
#ifdef CONFIG_X86_64
+ else
exec_control |= CPU_BASED_CR8_LOAD_EXITING |
CPU_BASED_CR8_STORE_EXITING;
#endif
- }
/*
* A vmexit (to either L1 hypervisor or L0 userspace) is always needed
* for I/O port accesses.
*/
- exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
exec_control |= CPU_BASED_UNCOND_IO_EXITING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+ exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+
+ /*
+ * This bit will be computed in nested_get_vmcs12_pages, because
+ * we do not have access to L1's MSR bitmap yet. For now, keep
+ * the same bit as before, hoping to avoid multiple VMWRITEs that
+ * only set/clear this bit.
+ */
+ exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+ exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
+
+ exec_controls_set(vmx, exec_control);
/*
* SECONDARY EXEC CONTROLS
@@ -2061,22 +2098,19 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
/* VMCS shadowing for L2 is emulated for now */
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
- if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
- vmcs_write16(GUEST_INTR_STATUS,
- vmcs12->guest_intr_status);
-
/*
- * Write an illegal value to APIC_ACCESS_ADDR. Later,
- * nested_get_vmcs12_pages will either fix it up or
- * remove the VM execution control.
+ * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
+ * will not have to rewrite the controls just for this bit.
*/
- if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
- vmcs_write64(APIC_ACCESS_ADDR, -1ull);
+ if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
+ (vmcs12->guest_cr4 & X86_CR4_UMIP))
+ exec_control |= SECONDARY_EXEC_DESC;
- if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
- vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+ if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+ vmcs_write16(GUEST_INTR_STATUS,
+ vmcs12->guest_intr_status);
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ secondary_exec_controls_set(vmx, exec_control);
}
/*
@@ -2095,7 +2129,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
if (guest_efer != host_efer)
exec_control |= VM_ENTRY_LOAD_IA32_EFER;
}
- vm_entry_controls_init(vmx, exec_control);
+ vm_entry_controls_set(vmx, exec_control);
/*
* EXIT CONTROLS
@@ -2107,17 +2141,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
exec_control = vmx_vmexit_ctrl();
if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
exec_control |= VM_EXIT_LOAD_IA32_EFER;
- vm_exit_controls_init(vmx, exec_control);
-
- /*
- * Conceptually we want to copy the PML address and index from
- * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
- * since we always flush the log on each vmexit and never change
- * the PML address (once set), this happens to be equivalent to
- * simply resetting the index in vmcs02.
- */
- if (enable_pml)
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ vm_exit_controls_set(vmx, exec_control);
/*
* Interrupt/Exception Fields
@@ -2138,7 +2162,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
}
}
-static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
@@ -2162,6 +2186,8 @@ static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+ vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+ vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
@@ -2198,6 +2224,10 @@ static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
}
+
+ if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
}
if (nested_cpu_has_xsaves(vmcs12))
@@ -2233,14 +2263,6 @@ static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
set_cr4_guest_host_mask(vmx);
-
- if (kvm_mpx_supported()) {
- if (vmx->nested.nested_run_pending &&
- (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
- vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
- else
- vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
- }
}
/*
@@ -2259,20 +2281,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
+ bool load_guest_pdptrs_vmcs12 = false;
- if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) {
- prepare_vmcs02_full(vmx, vmcs12);
+ if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
+ prepare_vmcs02_rare(vmx, vmcs12);
vmx->nested.dirty_vmcs12 = false;
- }
- /*
- * First, the fields that are shadowed. This must be kept in sync
- * with vmcs_shadow_fields.h.
- */
- if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
- vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
- vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+ load_guest_pdptrs_vmcs12 = !hv_evmcs ||
+ !(hv_evmcs->hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
}
if (vmx->nested.nested_run_pending &&
@@ -2283,6 +2300,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
}
+ if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+ vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
@@ -2372,6 +2392,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
entry_failure_code))
return -EINVAL;
+ /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
+ if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
+ is_pae_paging(vcpu)) {
+ vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+ }
+
if (!enable_ept)
vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
@@ -2609,6 +2638,30 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
!kvm_pat_valid(vmcs12->host_ia32_pat))
return -EINVAL;
+ ia32e = (vmcs12->vm_exit_controls &
+ VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+
+ if (vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+ vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+ vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+ vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+ vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+ vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+ vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
+ vmcs12->host_cs_selector == 0 ||
+ vmcs12->host_tr_selector == 0 ||
+ (vmcs12->host_ss_selector == 0 && !ia32e))
+ return -EINVAL;
+
+#ifdef CONFIG_X86_64
+ if (is_noncanonical_address(vmcs12->host_fs_base, vcpu) ||
+ is_noncanonical_address(vmcs12->host_gs_base, vcpu) ||
+ is_noncanonical_address(vmcs12->host_gdtr_base, vcpu) ||
+ is_noncanonical_address(vmcs12->host_idtr_base, vcpu) ||
+ is_noncanonical_address(vmcs12->host_tr_base, vcpu))
+ return -EINVAL;
+#endif
+
/*
* If the load IA32_EFER VM-exit control is 1, bits reserved in the
* IA32_EFER MSR must be 0 in the field for that register. In addition,
@@ -2616,8 +2669,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
* the host address-space size VM-exit control.
*/
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
- ia32e = (vmcs12->vm_exit_controls &
- VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
@@ -2781,7 +2832,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
[launched]"i"(offsetof(struct loaded_vmcs, launched)),
[host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
[wordsize]"i"(sizeof(ulong))
- : "cc", "memory"
+ : "memory"
);
if (vmx->msr_autoload.host.nr)
@@ -2851,18 +2902,14 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
hpa = page_to_phys(vmx->nested.apic_access_page);
vmcs_write64(APIC_ACCESS_ADDR, hpa);
} else {
- vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ secondary_exec_controls_clearbit(vmx,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
}
}
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
map = &vmx->nested.virtual_apic_map;
- /*
- * If translation failed, VM entry will fail because
- * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
- */
if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
@@ -2876,11 +2923,13 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
* _not_ what the processor does but it's basically the
* only possibility we have.
*/
- vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
- CPU_BASED_TPR_SHADOW);
+ exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
} else {
- printk("bad virtual-APIC page address\n");
- dump_vmcs();
+ /*
+ * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
+ * force VM-Entry to fail.
+ */
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
}
}
@@ -2896,11 +2945,9 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
}
}
if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
- vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
- CPU_BASED_USE_MSR_BITMAPS);
+ exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
else
- vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
- CPU_BASED_USE_MSR_BITMAPS);
+ exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
}
/*
@@ -2953,7 +3000,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
u32 exit_reason = EXIT_REASON_INVALID_STATE;
u32 exit_qual;
- evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
+ evaluate_pending_interrupts = exec_controls_get(vmx) &
(CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
@@ -2964,6 +3011,25 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+ /*
+ * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
+ * nested early checks are disabled. In the event of a "late" VM-Fail,
+ * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
+ * software model to the pre-VMEntry host state. When EPT is disabled,
+ * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
+ * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
+ * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
+ * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
+ * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
+ * guaranteed to be overwritten with a shadow CR3 prior to re-entering
+ * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
+ * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
+ * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
+ * path would need to manually save/restore vmcs01.GUEST_CR3.
+ */
+ if (!enable_ept && !nested_early_check)
+ vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
+
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
prepare_vmcs02_early(vmx, vmcs12);
@@ -3059,7 +3125,7 @@ vmentry_fail_vmexit:
vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
vmcs12->exit_qualification = exit_qual;
if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
- vmx->nested.need_vmcs12_sync = true;
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
return 1;
}
@@ -3077,7 +3143,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true))
+ if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
return 1;
if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
@@ -3393,20 +3459,57 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
}
-/*
- * Update the guest state fields of vmcs12 to reflect changes that
- * occurred while L2 was running. (The "IA-32e mode guest" bit of the
- * VM-entry controls is also updated, since this is really a guest
- * state bit.)
- */
-static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
-{
- vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
- vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+static bool is_vmcs12_ext_field(unsigned long field)
+{
+ switch (field) {
+ case GUEST_ES_SELECTOR:
+ case GUEST_CS_SELECTOR:
+ case GUEST_SS_SELECTOR:
+ case GUEST_DS_SELECTOR:
+ case GUEST_FS_SELECTOR:
+ case GUEST_GS_SELECTOR:
+ case GUEST_LDTR_SELECTOR:
+ case GUEST_TR_SELECTOR:
+ case GUEST_ES_LIMIT:
+ case GUEST_CS_LIMIT:
+ case GUEST_SS_LIMIT:
+ case GUEST_DS_LIMIT:
+ case GUEST_FS_LIMIT:
+ case GUEST_GS_LIMIT:
+ case GUEST_LDTR_LIMIT:
+ case GUEST_TR_LIMIT:
+ case GUEST_GDTR_LIMIT:
+ case GUEST_IDTR_LIMIT:
+ case GUEST_ES_AR_BYTES:
+ case GUEST_DS_AR_BYTES:
+ case GUEST_FS_AR_BYTES:
+ case GUEST_GS_AR_BYTES:
+ case GUEST_LDTR_AR_BYTES:
+ case GUEST_TR_AR_BYTES:
+ case GUEST_ES_BASE:
+ case GUEST_CS_BASE:
+ case GUEST_SS_BASE:
+ case GUEST_DS_BASE:
+ case GUEST_FS_BASE:
+ case GUEST_GS_BASE:
+ case GUEST_LDTR_BASE:
+ case GUEST_TR_BASE:
+ case GUEST_GDTR_BASE:
+ case GUEST_IDTR_BASE:
+ case GUEST_PENDING_DBG_EXCEPTIONS:
+ case GUEST_BNDCFGS:
+ return true;
+ default:
+ break;
+ }
- vmcs12->guest_rsp = kvm_rsp_read(vcpu);
- vmcs12->guest_rip = kvm_rip_read(vcpu);
- vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+ return false;
+}
+
+static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
@@ -3427,8 +3530,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
- vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
- vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
@@ -3444,11 +3545,69 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+ vmcs12->guest_pending_dbg_exceptions =
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+ if (kvm_mpx_supported())
+ vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+
+ vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
+}
+
+static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int cpu;
+
+ if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
+ return;
+
+
+ WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
+
+ cpu = get_cpu();
+ vmx->loaded_vmcs = &vmx->nested.vmcs02;
+ vmx_vcpu_load(&vmx->vcpu, cpu);
+
+ sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ vmx->loaded_vmcs = &vmx->vmcs01;
+ vmx_vcpu_load(&vmx->vcpu, cpu);
+ put_cpu();
+}
+
+/*
+ * Update the guest state fields of vmcs12 to reflect changes that
+ * occurred while L2 was running. (The "IA-32e mode guest" bit of the
+ * VM-entry controls is also updated, since this is really a guest
+ * state bit.)
+ */
+static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (vmx->nested.hv_evmcs)
+ sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
+
+ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
+ vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+
+ vmcs12->guest_rsp = kvm_rsp_read(vcpu);
+ vmcs12->guest_rip = kvm_rip_read(vcpu);
+ vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+
+ vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+ vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+
+ vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+ vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+ vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
vmcs12->guest_interruptibility_info =
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
- vmcs12->guest_pending_dbg_exceptions =
- vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
else
@@ -3469,10 +3628,12 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
*/
if (enable_ept) {
vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
- vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
- vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
- vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
- vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+ if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
+ vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+ vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+ vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+ vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+ }
}
vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
@@ -3484,22 +3645,11 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
- if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
- vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
- }
- /* TODO: These cannot have changed unless we have MSR bitmaps and
- * the relevant bit asks not to trap the change */
- if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
- vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
vmcs12->guest_ia32_efer = vcpu->arch.efer;
- vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
- vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
- vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
- if (kvm_mpx_supported())
- vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
}
/*
@@ -3517,11 +3667,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
u32 exit_reason, u32 exit_intr_info,
unsigned long exit_qualification)
{
- /* update guest state fields: */
- sync_vmcs12(vcpu, vmcs12);
-
/* update exit information fields: */
-
vmcs12->vm_exit_reason = exit_reason;
vmcs12->exit_qualification = exit_qualification;
vmcs12->vm_exit_intr_info = exit_intr_info;
@@ -3775,18 +3921,8 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
nested_ept_uninit_mmu_context(vcpu);
-
- /*
- * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3
- * points to shadow pages! Fortunately we only get here after a WARN_ON
- * if EPT is disabled, so a VMabort is perfectly fine.
- */
- if (enable_ept) {
- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
- } else {
- nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED);
- }
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
/*
* Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
@@ -3794,7 +3930,8 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
* VMFail, like everything else we just need to ensure our
* software model is up-to-date.
*/
- ept_save_pdptrs(vcpu);
+ if (enable_ept)
+ ept_save_pdptrs(vcpu);
kvm_mmu_reset_context(vcpu);
@@ -3882,14 +4019,14 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
if (likely(!vmx->fail)) {
- if (exit_reason == -1)
- sync_vmcs12(vcpu, vmcs12);
- else
+ sync_vmcs02_to_vmcs12(vcpu, vmcs12);
+
+ if (exit_reason != -1)
prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
exit_qualification);
/*
- * Must happen outside of sync_vmcs12() as it will
+ * Must happen outside of sync_vmcs02_to_vmcs12() as it will
* also be used to capture vmcs12 cache as part of
* capturing nVMX state for snapshot (migration).
*
@@ -3945,7 +4082,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
- vmx->nested.need_vmcs12_sync = true;
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
/* in case we halted in L2 */
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -4008,7 +4145,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
* #UD or #GP.
*/
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
- u32 vmx_instruction_info, bool wr, gva_t *ret)
+ u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
{
gva_t off;
bool exn;
@@ -4068,7 +4205,10 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
* mode, e.g. a 32-bit address size can yield a 64-bit virtual
* address when using FS/GS with a non-zero base.
*/
- *ret = s.base + off;
+ if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
+ *ret = s.base + off;
+ else
+ *ret = off;
/* Long mode: #GP(0)/#SS(0) if the memory address is in a
* non-canonical form. This is the only check on the memory
@@ -4115,7 +4255,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
*/
if (!(s.base == 0 && s.limit == 0xffffffff &&
((s.type & 8) || !(s.type & 4))))
- exn = exn || (off + sizeof(u64) > s.limit);
+ exn = exn || ((u64)off + len - 1 > s.limit);
}
if (exn) {
kvm_queue_exception_e(vcpu,
@@ -4134,7 +4274,8 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
struct x86_exception e;
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
- vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
+ vmcs_read32(VMX_INSTRUCTION_INFO), false,
+ sizeof(*vmpointer), &gva))
return 1;
if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
@@ -4300,11 +4441,12 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
if (vmx->nested.current_vmptr == -1ull)
return;
+ copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
+
if (enable_shadow_vmcs) {
/* copy to memory all shadowed fields in case
they were modified */
copy_shadow_to_vmcs12(vmx);
- vmx->nested.need_vmcs12_sync = false;
vmx_disable_shadow_vmcs(vmx);
}
vmx->nested.posted_intr_nv = -1;
@@ -4334,6 +4476,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 zero = 0;
gpa_t vmptr;
+ u64 evmcs_gpa;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -4349,10 +4492,18 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
return nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_VMXON_POINTER);
- if (vmx->nested.hv_evmcs_map.hva) {
- if (vmptr == vmx->nested.hv_evmcs_vmptr)
- nested_release_evmcs(vcpu);
- } else {
+ /*
+ * When Enlightened VMEntry is enabled on the calling CPU we treat
+ * memory area pointer by vmptr as Enlightened VMCS (as there's no good
+ * way to distinguish it from VMCS12) and we must not corrupt it by
+ * writing to the non-existent 'launch_state' field. The area doesn't
+ * have to be the currently active EVMCS on the calling CPU and there's
+ * nothing KVM has to do to transition it from 'active' to 'non-active'
+ * state. It is possible that the area will stay mapped as
+ * vmx->nested.hv_evmcs but this shouldn't be a problem.
+ */
+ if (likely(!vmx->nested.enlightened_vmcs_enabled ||
+ !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vcpu);
@@ -4386,8 +4537,10 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
u64 field_value;
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ int len;
gva_t gva = 0;
struct vmcs12 *vmcs12;
+ short offset;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -4409,11 +4562,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
/* Decode instruction info and find the field to read */
field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
- /* Read the field, zero-extended to a u64 field_value */
- if (vmcs12_read_any(vmcs12, field, &field_value) < 0)
+
+ offset = vmcs_field_to_offset(field);
+ if (offset < 0)
return nested_vmx_failValid(vcpu,
VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+ /* Read the field, zero-extended to a u64 field_value */
+ field_value = vmcs12_read_any(vmcs12, field, offset);
+
/*
* Now copy part of this value to register or memory, as requested.
* Note that the number of bits actually copied is 32 or 64 depending
@@ -4423,21 +4583,45 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
field_value);
} else {
+ len = is_64_bit_mode(vcpu) ? 8 : 4;
if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, true, &gva))
+ vmx_instruction_info, true, len, &gva))
return 1;
/* _system ok, nested_vmx_check_permission has verified cpl=0 */
- kvm_write_guest_virt_system(vcpu, gva, &field_value,
- (is_long_mode(vcpu) ? 8 : 4), NULL);
+ kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL);
}
return nested_vmx_succeed(vcpu);
}
+static bool is_shadow_field_rw(unsigned long field)
+{
+ switch (field) {
+#define SHADOW_FIELD_RW(x, y) case x:
+#include "vmcs_shadow_fields.h"
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static bool is_shadow_field_ro(unsigned long field)
+{
+ switch (field) {
+#define SHADOW_FIELD_RO(x, y) case x:
+#include "vmcs_shadow_fields.h"
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
static int handle_vmwrite(struct kvm_vcpu *vcpu)
{
unsigned long field;
+ int len;
gva_t gva;
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4452,6 +4636,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
u64 field_value = 0;
struct x86_exception e;
struct vmcs12 *vmcs12;
+ short offset;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -4463,11 +4648,11 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
field_value = kvm_register_readl(vcpu,
(((vmx_instruction_info) >> 3) & 0xf));
else {
+ len = is_64_bit_mode(vcpu) ? 8 : 4;
if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, false, &gva))
+ vmx_instruction_info, false, len, &gva))
return 1;
- if (kvm_read_guest_virt(vcpu, gva, &field_value,
- (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
+ if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
@@ -4484,9 +4669,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return nested_vmx_failValid(vcpu,
VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
- if (!is_guest_mode(vcpu))
+ if (!is_guest_mode(vcpu)) {
vmcs12 = get_vmcs12(vcpu);
- else {
+
+ /*
+ * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
+ * vmcs12, else we may crush a field or consume a stale value.
+ */
+ if (!is_shadow_field_rw(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+ } else {
/*
* When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
* to shadowed-field sets the ALU flags for VMfailInvalid.
@@ -4496,28 +4688,46 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
vmcs12 = get_shadow_vmcs12(vcpu);
}
- if (vmcs12_write_any(vmcs12, field, field_value) < 0)
+ offset = vmcs_field_to_offset(field);
+ if (offset < 0)
return nested_vmx_failValid(vcpu,
VMXERR_UNSUPPORTED_VMCS_COMPONENT);
/*
- * Do not track vmcs12 dirty-state if in guest-mode
- * as we actually dirty shadow vmcs12 instead of vmcs12.
+ * Some Intel CPUs intentionally drop the reserved bits of the AR byte
+ * fields on VMWRITE. Emulate this behavior to ensure consistent KVM
+ * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
+ * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
+ * from L1 will return a different value than VMREAD from L2 (L1 sees
+ * the stripped down value, L2 sees the full value as stored by KVM).
*/
- if (!is_guest_mode(vcpu)) {
- switch (field) {
-#define SHADOW_FIELD_RW(x) case x:
-#include "vmcs_shadow_fields.h"
- /*
- * The fields that can be updated by L1 without a vmexit are
- * always updated in the vmcs02, the others go down the slow
- * path of prepare_vmcs02.
- */
- break;
- default:
- vmx->nested.dirty_vmcs12 = true;
- break;
+ if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
+ field_value &= 0x1f0ff;
+
+ vmcs12_write_any(vmcs12, field, offset, field_value);
+
+ /*
+ * Do not track vmcs12 dirty-state if in guest-mode as we actually
+ * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
+ * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
+ * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
+ */
+ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
+ /*
+ * L1 can read these fields without exiting, ensure the
+ * shadow VMCS is up-to-date.
+ */
+ if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
+ preempt_disable();
+ vmcs_load(vmx->vmcs01.shadow_vmcs);
+
+ __vmcs_writel(field, field_value);
+
+ vmcs_clear(vmx->vmcs01.shadow_vmcs);
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+ preempt_enable();
}
+ vmx->nested.dirty_vmcs12 = true;
}
return nested_vmx_succeed(vcpu);
@@ -4527,11 +4737,10 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
{
vmx->nested.current_vmptr = vmptr;
if (enable_shadow_vmcs) {
- vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_SHADOW_VMCS);
+ secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER,
__pa(vmx->vmcs01.shadow_vmcs));
- vmx->nested.need_vmcs12_sync = true;
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
}
vmx->nested.dirty_vmcs12 = true;
}
@@ -4615,7 +4824,8 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
return 1;
- if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
+ if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
+ true, sizeof(gpa_t), &gva))
return 1;
/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
@@ -4661,7 +4871,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
* operand is read even if it isn't needed (e.g., for type==global)
*/
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
- vmx_instruction_info, false, &gva))
+ vmx_instruction_info, false, sizeof(operand), &gva))
return 1;
if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
kvm_inject_page_fault(vcpu, &e);
@@ -4670,13 +4880,11 @@ static int handle_invept(struct kvm_vcpu *vcpu)
switch (type) {
case VMX_EPT_EXTENT_GLOBAL:
+ case VMX_EPT_EXTENT_CONTEXT:
/*
- * TODO: track mappings and invalidate
- * single context requests appropriately
+ * TODO: Sync the necessary shadow EPT roots here, rather than
+ * at the next emulated VM-entry.
*/
- case VMX_EPT_EXTENT_CONTEXT:
- kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
break;
default:
BUG_ON(1);
@@ -4723,7 +4931,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
* operand is read even if it isn't needed (e.g., for type==global)
*/
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
- vmx_instruction_info, false, &gva))
+ vmx_instruction_info, false, sizeof(operand), &gva))
return 1;
if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
kvm_inject_page_fault(vcpu, &e);
@@ -5284,12 +5492,13 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
* When running L2, the authoritative vmcs12 state is in the
* vmcs02. When running L1, the authoritative vmcs12 state is
* in the shadow or enlightened vmcs linked to vmcs01, unless
- * need_vmcs12_sync is set, in which case, the authoritative
+ * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
* vmcs12 state is in the vmcs12 already.
*/
if (is_guest_mode(vcpu)) {
- sync_vmcs12(vcpu, vmcs12);
- } else if (!vmx->nested.need_vmcs12_sync) {
+ sync_vmcs02_to_vmcs12(vcpu, vmcs12);
+ sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+ } else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
if (vmx->nested.hv_evmcs)
copy_enlightened_to_vmcs12(vmx);
else if (enable_shadow_vmcs)
@@ -5421,7 +5630,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
* Sync eVMCS upon entry as we may not have
* HV_X64_MSR_VP_ASSIST_PAGE set up yet.
*/
- vmx->nested.need_vmcs12_sync = true;
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
} else {
return -EINVAL;
}
@@ -5489,14 +5698,8 @@ error_guest_mode:
void nested_vmx_vcpu_setup(void)
{
if (enable_shadow_vmcs) {
- /*
- * At vCPU creation, "VMWRITE to any supported field
- * in the VMCS" is supported, so use the more
- * permissive vmx_vmread_bitmap to specify both read
- * and write permissions for the shadow VMCS.
- */
vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
- vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
+ vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
}
}
@@ -5626,10 +5829,15 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
msrs->secondary_ctls_low = 0;
msrs->secondary_ctls_high &=
SECONDARY_EXEC_DESC |
+ SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_WBINVD_EXITING;
+ SECONDARY_EXEC_RDRAND_EXITING |
+ SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_RDSEED_EXITING |
+ SECONDARY_EXEC_XSAVES;
/*
* We can emulate "VMCS shadowing," even if the hardware
@@ -5749,14 +5957,6 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
int i;
- /*
- * Without EPT it is not possible to restore L1's CR3 and PDPTR on
- * VMfail, because they are not available in vmcs01. Just always
- * use hardware checks.
- */
- if (!enable_ept)
- nested_early_check = 1;
-
if (!cpu_has_vmx_shadow_vmcs())
enable_shadow_vmcs = 0;
if (enable_shadow_vmcs) {