From c10a8de0d32e95b0b8c7c17b6dc09baea5a5a899 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 19 Oct 2018 10:04:18 -0700 Subject: perf/x86/intel/uncore: Add more IMC PCI IDs for KabyLake and CoffeeLake CPUs KabyLake and CoffeeLake CPUs have the same client uncore events as SkyLake. Add the PCI IDs for the KabyLake Y, U, S processor lines and CoffeeLake U, H, S processor lines. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/20181019170419.378-1-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snb.c | 115 ++++++++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 8527c3e1038b..bfa25814fe5f 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -15,6 +15,25 @@ #define PCI_DEVICE_ID_INTEL_SKL_HQ_IMC 0x1910 #define PCI_DEVICE_ID_INTEL_SKL_SD_IMC 0x190f #define PCI_DEVICE_ID_INTEL_SKL_SQ_IMC 0x191f +#define PCI_DEVICE_ID_INTEL_KBL_Y_IMC 0x590c +#define PCI_DEVICE_ID_INTEL_KBL_U_IMC 0x5904 +#define PCI_DEVICE_ID_INTEL_KBL_UQ_IMC 0x5914 +#define PCI_DEVICE_ID_INTEL_KBL_SD_IMC 0x590f +#define PCI_DEVICE_ID_INTEL_KBL_SQ_IMC 0x591f +#define PCI_DEVICE_ID_INTEL_CFL_2U_IMC 0x3ecc +#define PCI_DEVICE_ID_INTEL_CFL_4U_IMC 0x3ed0 +#define PCI_DEVICE_ID_INTEL_CFL_4H_IMC 0x3e10 +#define PCI_DEVICE_ID_INTEL_CFL_6H_IMC 0x3ec4 +#define PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC 0x3e0f +#define PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC 0x3e1f +#define PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC 0x3ec2 +#define PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC 0x3e30 +#define PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC 0x3e18 +#define PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC 0x3ec6 +#define PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC 0x3e31 +#define PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC 0x3e33 +#define PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC 0x3eca +#define PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC 0x3e32 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -569,7 +588,82 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SQ_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, - + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_Y_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_UQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SD_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4H_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6H_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -618,6 +712,25 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(SKL_HQ_IMC, &skl_uncore_pci_driver), /* 6th Gen Core H Quad Core */ IMC_DEV(SKL_SD_IMC, &skl_uncore_pci_driver), /* 6th Gen Core S Dual Core */ IMC_DEV(SKL_SQ_IMC, &skl_uncore_pci_driver), /* 6th Gen Core S Quad Core */ + IMC_DEV(KBL_Y_IMC, &skl_uncore_pci_driver), /* 7th Gen Core Y */ + IMC_DEV(KBL_U_IMC, &skl_uncore_pci_driver), /* 7th Gen Core U */ + IMC_DEV(KBL_UQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core U Quad Core */ + IMC_DEV(KBL_SD_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S Dual Core */ + IMC_DEV(KBL_SQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S Quad Core */ + IMC_DEV(CFL_2U_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U 2 Cores */ + IMC_DEV(CFL_4U_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U 4 Cores */ + IMC_DEV(CFL_4H_IMC, &skl_uncore_pci_driver), /* 8th Gen Core H 4 Cores */ + IMC_DEV(CFL_6H_IMC, &skl_uncore_pci_driver), /* 8th Gen Core H 6 Cores */ + IMC_DEV(CFL_2S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 2 Cores Desktop */ + IMC_DEV(CFL_4S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Desktop */ + IMC_DEV(CFL_6S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Desktop */ + IMC_DEV(CFL_8S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Desktop */ + IMC_DEV(CFL_4S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Work Station */ + IMC_DEV(CFL_6S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Work Station */ + IMC_DEV(CFL_8S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Work Station */ + IMC_DEV(CFL_4S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Server */ + IMC_DEV(CFL_6S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Server */ + IMC_DEV(CFL_8S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Server */ { /* end marker */ } }; -- cgit v1.2.3-55-g7522 From 4d47d6407ac7b4b442a4e717488a3bb137398b6c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 19 Oct 2018 10:04:19 -0700 Subject: perf/x86/intel/uncore: Support CoffeeLake 8th CBOX Coffee Lake has 8 core products which has 8 Cboxes. The 8th CBOX is mapped into different MSR space. Increase the num_boxes to 8 to handle the new products. It will not impact the previous platforms, SkyLake, KabyLake and earlier CoffeeLake. Because the num_boxes will be recalculated in uncore_cpu_init and doesn't exceed the x86_max_cores. Introduce a new box flag bit to indicate the 8th CBOX. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/20181019170419.378-2-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.h | 33 +++++++++++++++++++++++++-------- arch/x86/events/intel/uncore_snb.c | 6 +++++- 2 files changed, 30 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index e17ab885b1e9..cb46d602a6b8 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -129,8 +129,15 @@ struct intel_uncore_box { struct intel_uncore_extra_reg shared_regs[0]; }; -#define UNCORE_BOX_FLAG_INITIATED 0 -#define UNCORE_BOX_FLAG_CTL_OFFS8 1 /* event config registers are 8-byte apart */ +/* CFL uncore 8th cbox MSRs */ +#define CFL_UNC_CBO_7_PERFEVTSEL0 0xf70 +#define CFL_UNC_CBO_7_PER_CTR0 0xf76 + +#define UNCORE_BOX_FLAG_INITIATED 0 +/* event config registers are 8-byte apart */ +#define UNCORE_BOX_FLAG_CTL_OFFS8 1 +/* CFL 8th CBOX has different MSR space */ +#define UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS 2 struct uncore_event_desc { struct kobj_attribute attr; @@ -297,17 +304,27 @@ unsigned int uncore_freerunning_counter(struct intel_uncore_box *box, static inline unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) { - return box->pmu->type->event_ctl + - (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + - uncore_msr_box_offset(box); + if (test_bit(UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS, &box->flags)) { + return CFL_UNC_CBO_7_PERFEVTSEL0 + + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx); + } else { + return box->pmu->type->event_ctl + + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + + uncore_msr_box_offset(box); + } } static inline unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) { - return box->pmu->type->perf_ctr + - (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + - uncore_msr_box_offset(box); + if (test_bit(UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS, &box->flags)) { + return CFL_UNC_CBO_7_PER_CTR0 + + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx); + } else { + return box->pmu->type->perf_ctr + + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + + uncore_msr_box_offset(box); + } } static inline diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index bfa25814fe5f..2593b0d7aeee 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -221,6 +221,10 @@ static void skl_uncore_msr_init_box(struct intel_uncore_box *box) wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL); } + + /* The 8th CBOX has different MSR space */ + if (box->pmu->pmu_idx == 7) + __set_bit(UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS, &box->flags); } static void skl_uncore_msr_enable_box(struct intel_uncore_box *box) @@ -247,7 +251,7 @@ static struct intel_uncore_ops skl_uncore_msr_ops = { static struct intel_uncore_type skl_uncore_cbox = { .name = "cbox", .num_counters = 4, - .num_boxes = 5, + .num_boxes = 8, .perf_ctr_bits = 44, .fixed_ctr_bits = 48, .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, -- cgit v1.2.3-55-g7522 From 3841840449817ba6cf3e636008bc4e1061a03388 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 20 Nov 2018 08:25:28 +0100 Subject: x86/boot: Mostly revert commit ae7e1238e68f2a ("Add ACPI RSDP address to setup_header") Peter Anvin pointed out that commit: ae7e1238e68f2a ("x86/boot: Add ACPI RSDP address to setup_header") should be reverted as setup_header should only contain items set by the legacy BIOS. So revert said commit. Instead of fully reverting the dependent commit of: e7b66d16fe4172 ("x86/acpi, x86/boot: Take RSDP address for boot params if available") just remove the setup_header reference in order to replace it by a boot_params in a followup patch. Suggested-by: "H. Peter Anvin" Signed-off-by: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boris.ostrovsky@oracle.com Cc: bp@alien8.de Cc: daniel.kiper@oracle.com Cc: sstabellini@kernel.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20181120072529.5489-2-jgross@suse.com Signed-off-by: Ingo Molnar --- Documentation/x86/boot.txt | 32 +------------------------------- arch/x86/boot/header.S | 6 +----- arch/x86/include/asm/x86_init.h | 2 -- arch/x86/include/uapi/asm/bootparam.h | 4 ---- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/head32.c | 1 - arch/x86/kernel/head64.c | 2 -- arch/x86/kernel/setup.c | 17 ----------------- 8 files changed, 3 insertions(+), 63 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 7727db8f94bc..5e9b826b5f62 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -61,18 +61,6 @@ Protocol 2.12: (Kernel 3.8) Added the xloadflags field and extension fields to struct boot_params for loading bzImage and ramdisk above 4G in 64bit. -Protocol 2.13: (Kernel 3.14) Support 32- and 64-bit flags being set in - xloadflags to support booting a 64-bit kernel from 32-bit - EFI - -Protocol 2.14: (Kernel 4.20) Added acpi_rsdp_addr holding the physical - address of the ACPI RSDP table. - The bootloader updates version with: - 0x8000 | min(kernel-version, bootloader-version) - kernel-version being the protocol version supported by - the kernel and bootloader-version the protocol version - supported by the bootloader. - **** MEMORY LAYOUT The traditional memory map for the kernel loader, used for Image or @@ -209,7 +197,6 @@ Offset Proto Name Meaning 0258/8 2.10+ pref_address Preferred loading address 0260/4 2.10+ init_size Linear memory required during initialization 0264/4 2.11+ handover_offset Offset of handover entry point -0268/8 2.14+ acpi_rsdp_addr Physical address of RSDP table (1) For backwards compatibility, if the setup_sects field contains 0, the real value is 4. @@ -322,7 +309,7 @@ Protocol: 2.00+ Contains the magic number "HdrS" (0x53726448). Field name: version -Type: modify +Type: read Offset/size: 0x206/2 Protocol: 2.00+ @@ -330,12 +317,6 @@ Protocol: 2.00+ e.g. 0x0204 for version 2.04, and 0x0a11 for a hypothetical version 10.17. - Up to protocol version 2.13 this information is only read by the - bootloader. From protocol version 2.14 onwards the bootloader will - write the used protocol version or-ed with 0x8000 to the field. The - used protocol version will be the minimum of the supported protocol - versions of the bootloader and the kernel. - Field name: realmode_swtch Type: modify (optional) Offset/size: 0x208/4 @@ -763,17 +744,6 @@ Offset/size: 0x264/4 See EFI HANDOVER PROTOCOL below for more details. -Field name: acpi_rsdp_addr -Type: write -Offset/size: 0x268/8 -Protocol: 2.14+ - - This field can be set by the boot loader to tell the kernel the - physical address of the ACPI RSDP table. - - A value of 0 indicates the kernel should fall back to the standard - methods to locate the RSDP. - **** THE IMAGE CHECKSUM diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 4c881c850125..850b8762e889 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -300,7 +300,7 @@ _start: # Part 2 of the header, from the old setup.S .ascii "HdrS" # header signature - .word 0x020e # header version number (>= 0x0105) + .word 0x020d # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG @@ -558,10 +558,6 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr init_size: .long INIT_SIZE # kernel initialization size handover_offset: .long 0 # Filled in by build.c -acpi_rsdp_addr: .quad 0 # 64-bit physical pointer to the - # ACPI RSDP table, added with - # version 2.14 - # End of setup header ##################################################### .section ".entrytext", "ax" diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 0f842104862c..b85a7c54c6a1 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -303,6 +303,4 @@ extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); extern bool x86_pnpbios_disabled(void); -void x86_verify_bootdata_version(void); - #endif diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 22f89d040ddd..a06cbf019744 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -16,9 +16,6 @@ #define RAMDISK_PROMPT_FLAG 0x8000 #define RAMDISK_LOAD_FLAG 0x4000 -/* version flags */ -#define VERSION_WRITTEN 0x8000 - /* loadflags */ #define LOADED_HIGH (1<<0) #define KASLR_FLAG (1<<1) @@ -89,7 +86,6 @@ struct setup_header { __u64 pref_address; __u32 init_size; __u32 handover_offset; - __u64 acpi_rsdp_addr; } __attribute__((packed)); struct sys_desc_table { diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 92c76bf97ad8..fb3b1f3a5aba 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1776,5 +1776,5 @@ void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) u64 x86_default_get_root_pointer(void) { - return boot_params.hdr.acpi_rsdp_addr; + return 0; } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 76fa3b836598..ec6fefbfd3c0 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -37,7 +37,6 @@ asmlinkage __visible void __init i386_start_kernel(void) cr4_init_shadow(); sanitize_boot_params(&boot_params); - x86_verify_bootdata_version(); x86_early_init_platform_quirks(); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 7663a8eb602b..16b1cbd3a61e 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -457,8 +457,6 @@ void __init x86_64_start_reservations(char *real_mode_data) if (!boot_params.hdr.version) copy_bootdata(__va(real_mode_data)); - x86_verify_bootdata_version(); - x86_early_init_platform_quirks(); switch (boot_params.hdr.hardware_subarch) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b74e7bfed6ab..d494b9bfe618 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1280,23 +1280,6 @@ void __init setup_arch(char **cmdline_p) unwind_init(); } -/* - * From boot protocol 2.14 onwards we expect the bootloader to set the - * version to "0x8000 | ". In case we find a version >= 2.14 - * without the 0x8000 we assume the boot loader supports 2.13 only and - * reset the version accordingly. The 0x8000 flag is removed in any case. - */ -void __init x86_verify_bootdata_version(void) -{ - if (boot_params.hdr.version & VERSION_WRITTEN) - boot_params.hdr.version &= ~VERSION_WRITTEN; - else if (boot_params.hdr.version >= 0x020e) - boot_params.hdr.version = 0x020d; - - if (boot_params.hdr.version < 0x020e) - boot_params.hdr.acpi_rsdp_addr = 0; -} - #ifdef CONFIG_X86_32 static struct resource video_ram_resource = { -- cgit v1.2.3-55-g7522 From e6e094e053af75cbc164e950814d3d084fb1e698 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 20 Nov 2018 08:25:29 +0100 Subject: x86/acpi, x86/boot: Take RSDP address from boot params if available In case the RSDP address in struct boot_params is specified don't try to find the table by searching, but take the address directly as set by the boot loader. Suggested-by: "H. Peter Anvin" Signed-off-by: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boris.ostrovsky@oracle.com Cc: bp@alien8.de Cc: daniel.kiper@oracle.com Cc: sstabellini@kernel.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20181120072529.5489-3-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/bootparam.h | 3 ++- arch/x86/kernel/acpi/boot.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index a06cbf019744..60733f137e9a 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -155,7 +155,8 @@ struct boot_params { __u8 _pad2[4]; /* 0x054 */ __u64 tboot_addr; /* 0x058 */ struct ist_info ist_info; /* 0x060 */ - __u8 _pad3[16]; /* 0x070 */ + __u64 acpi_rsdp_addr; /* 0x070 */ + __u8 _pad3[8]; /* 0x078 */ __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ struct sys_desc_table sys_desc_table; /* obsolete! */ /* 0x0a0 */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index fb3b1f3a5aba..06635fbca81c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1776,5 +1776,5 @@ void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) u64 x86_default_get_root_pointer(void) { - return 0; + return boot_params.acpi_rsdp_addr; } -- cgit v1.2.3-55-g7522 From 68239654acafe6aad5a3c1dc7237e60accfebc03 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 20 Nov 2018 11:26:35 +0100 Subject: x86/fpu: Disable bottom halves while loading FPU registers The sequence fpu->initialized = 1; /* step A */ preempt_disable(); /* step B */ fpu__restore(fpu); preempt_enable(); in __fpu__restore_sig() is racy in regard to a context switch. For 32bit frames, __fpu__restore_sig() prepares the FPU state within fpu->state. To ensure that a context switch (switch_fpu_prepare() in particular) does not modify fpu->state it uses fpu__drop() which sets fpu->initialized to 0. After fpu->initialized is cleared, the CPU's FPU state is not saved to fpu->state during a context switch. The new state is loaded via fpu__restore(). It gets loaded into fpu->state from userland and ensured it is sane. fpu->initialized is then set to 1 in order to avoid fpu__initialize() doing anything (overwrite the new state) which is part of fpu__restore(). A context switch between step A and B above would save CPU's current FPU registers to fpu->state and overwrite the newly prepared state. This looks like a tiny race window but the Kernel Test Robot reported this back in 2016 while we had lazy FPU support. Borislav Petkov made the link between that report and another patch that has been posted. Since the removal of the lazy FPU support, this race goes unnoticed because the warning has been removed. Disable bottom halves around the restore sequence to avoid the race. BH need to be disabled because BH is allowed to run (even with preemption disabled) and might invoke kernel_fpu_begin() by doing IPsec. [ bp: massage commit message a bit. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Borislav Petkov Acked-by: Ingo Molnar Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: "Jason A. Donenfeld" Cc: kvm ML Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Rik van Riel Cc: stable@vger.kernel.org Cc: x86-ml Link: http://lkml.kernel.org/r/20181120102635.ddv3fvavxajjlfqk@linutronix.de Link: https://lkml.kernel.org/r/20160226074940.GA28911@pd.tnic --- arch/x86/kernel/fpu/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 61a949d84dfa..d99a8ee9e185 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -344,10 +344,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); } + local_bh_disable(); fpu->initialized = 1; - preempt_disable(); fpu__restore(fpu); - preempt_enable(); + local_bh_enable(); return err; } else { -- cgit v1.2.3-55-g7522 From 2a5bf23d5b795d5df33dc284e8f5cf8b6a5b4042 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Nov 2018 18:08:42 +0100 Subject: perf/x86/intel: Fix regression by default disabling perfmon v4 interrupt handling Kyle Huey reported that 'rr', a replay debugger, broke due to the following commit: af3bdb991a5c ("perf/x86/intel: Add a separate Arch Perfmon v4 PMI handler") Rework the 'disable_counter_freezing' __setup() parameter such that we can explicitly enable/disable it and switch to default disabled. To this purpose, rename the parameter to "perf_v4_pmi=" which is a much better description and allows requiring a bool argument. [ mingo: Improved the changelog some more. ] Reported-by: Kyle Huey Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert O'Callahan Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Link: http://lkml.kernel.org/r/20181120170842.GZ2131@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- Documentation/admin-guide/kernel-parameters.txt | 3 ++- arch/x86/events/intel/core.c | 12 ++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 81d1d5a74728..5463d5a4d85c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -856,7 +856,8 @@ causing system reset or hang due to sending INIT from AP to BSP. - disable_counter_freezing [HW] + perf_v4_pmi= [X86,INTEL] + Format: Disable Intel PMU counter freezing feature. The feature only exists starting from Arch Perfmon v4 (Skylake and newer). diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 273c62e81546..af8bea9d4006 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2306,14 +2306,18 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) return handled; } -static bool disable_counter_freezing; +static bool disable_counter_freezing = true; static int __init intel_perf_counter_freezing_setup(char *s) { - disable_counter_freezing = true; - pr_info("Intel PMU Counter freezing feature disabled\n"); + bool res; + + if (kstrtobool(s, &res)) + return -EINVAL; + + disable_counter_freezing = !res; return 1; } -__setup("disable_counter_freezing", intel_perf_counter_freezing_setup); +__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup); /* * Simplified handler for Arch Perfmon v4: -- cgit v1.2.3-55-g7522 From ed6101bbf6266ee83e620b19faa7c6ad56bb41ab Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 21 Nov 2018 11:16:10 +0100 Subject: perf/x86/intel: Move branch tracing setup to the Intel-specific source file Moving branch tracing setup to Intel core object into separate intel_pmu_bts_config function, because it's Intel specific. Suggested-by: Peter Zijlstra Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/20181121101612.16272-1-jolsa@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 20 -------------------- arch/x86/events/intel/core.c | 41 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 106911b603bd..374a19712e20 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -438,26 +438,6 @@ int x86_setup_perfctr(struct perf_event *event) if (config == -1LL) return -EINVAL; - /* - * Branch tracing: - */ - if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && - !attr->freq && hwc->sample_period == 1) { - /* BTS is not supported by this architecture. */ - if (!x86_pmu.bts_active) - return -EOPNOTSUPP; - - /* BTS is currently only allowed for user-mode. */ - if (!attr->exclude_kernel) - return -EOPNOTSUPP; - - /* disallow bts if conflicting events are present */ - if (x86_add_exclusive(x86_lbr_exclusive_lbr)) - return -EBUSY; - - event->destroy = hw_perf_lbr_event_destroy; - } - hwc->config |= config; return 0; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index af8bea9d4006..a95079abeb03 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3102,10 +3102,49 @@ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event) return flags; } +static int intel_pmu_bts_config(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + + if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !attr->freq && hwc->sample_period == 1) { + /* BTS is not supported by this architecture. */ + if (!x86_pmu.bts_active) + return -EOPNOTSUPP; + + /* BTS is currently only allowed for user-mode. */ + if (!attr->exclude_kernel) + return -EOPNOTSUPP; + + /* disallow bts if conflicting events are present */ + if (x86_add_exclusive(x86_lbr_exclusive_lbr)) + return -EBUSY; + + event->destroy = hw_perf_lbr_event_destroy; + } + + return 0; +} + +static int core_pmu_hw_config(struct perf_event *event) +{ + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + return intel_pmu_bts_config(event); +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); + if (ret) + return ret; + + ret = intel_pmu_bts_config(event); if (ret) return ret; @@ -3600,7 +3639,7 @@ static __initconst const struct x86_pmu core_pmu = { .enable_all = core_pmu_enable_all, .enable = core_pmu_enable_event, .disable = x86_pmu_disable_event, - .hw_config = x86_pmu_hw_config, + .hw_config = core_pmu_hw_config, .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, -- cgit v1.2.3-55-g7522 From 67266c1080ad56c31af72b9c18355fde8ccc124a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 21 Nov 2018 11:16:11 +0100 Subject: perf/x86/intel: Add generic branch tracing check to intel_pmu_has_bts() Currently we check the branch tracing only by checking for the PERF_COUNT_HW_BRANCH_INSTRUCTIONS event of PERF_TYPE_HARDWARE type. But we can define the same event with the PERF_TYPE_RAW type. Changing the intel_pmu_has_bts() code to check on event's final hw config value, so both HW types are covered. Adding unlikely to intel_pmu_has_bts() condition calls, because it was used in the original code in intel_bts_constraints. Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/20181121101612.16272-2-jolsa@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 17 +++-------------- arch/x86/events/perf_event.h | 13 +++++++++---- 2 files changed, 12 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a95079abeb03..a15a73788693 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2474,16 +2474,7 @@ done: static struct event_constraint * intel_bts_constraints(struct perf_event *event) { - struct hw_perf_event *hwc = &event->hw; - unsigned int hw_event, bts_event; - - if (event->attr.freq) - return NULL; - - hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; - bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); - - if (unlikely(hw_event == bts_event && hwc->sample_period == 1)) + if (unlikely(intel_pmu_has_bts(event))) return &bts_constraint; return NULL; @@ -3105,10 +3096,8 @@ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event) static int intel_pmu_bts_config(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; - struct hw_perf_event *hwc = &event->hw; - if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && - !attr->freq && hwc->sample_period == 1) { + if (unlikely(intel_pmu_has_bts(event))) { /* BTS is not supported by this architecture. */ if (!x86_pmu.bts_active) return -EOPNOTSUPP; @@ -3170,7 +3159,7 @@ static int intel_pmu_hw_config(struct perf_event *event) /* * BTS is set up earlier in this path, so don't account twice */ - if (!intel_pmu_has_bts(event)) { + if (!unlikely(intel_pmu_has_bts(event))) { /* disallow lbr if conflicting events are present */ if (x86_add_exclusive(x86_lbr_exclusive_lbr)) return -EBUSY; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index adae087cecdd..78d7b7031bfc 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -859,11 +859,16 @@ static inline int amd_pmu_init(void) static inline bool intel_pmu_has_bts(struct perf_event *event) { - if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && - !event->attr.freq && event->hw.sample_period == 1) - return true; + struct hw_perf_event *hwc = &event->hw; + unsigned int hw_event, bts_event; + + if (event->attr.freq) + return false; + + hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; + bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); - return false; + return hw_event == bts_event && hwc->sample_period == 1; } int intel_pmu_save_and_restart(struct perf_event *event); -- cgit v1.2.3-55-g7522 From 472de49fdc53365c880ab81ae2b5cfdd83db0b06 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 21 Nov 2018 11:16:12 +0100 Subject: perf/x86/intel: Disallow precise_ip on BTS events Vince reported a crash in the BTS flush code when touching the callchain data, which was supposed to be initialized as an 'early' callchain, but intel_pmu_drain_bts_buffer() does not do that: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 ... Call Trace: intel_pmu_drain_bts_buffer+0x151/0x220 ? intel_get_event_constraints+0x219/0x360 ? perf_assign_events+0xe2/0x2a0 ? select_idle_sibling+0x22/0x3a0 ? __update_load_avg_se+0x1ec/0x270 ? enqueue_task_fair+0x377/0xdd0 ? cpumask_next_and+0x19/0x20 ? load_balance+0x134/0x950 ? check_preempt_curr+0x7a/0x90 ? ttwu_do_wakeup+0x19/0x140 x86_pmu_stop+0x3b/0x90 x86_pmu_del+0x57/0x160 event_sched_out.isra.106+0x81/0x170 group_sched_out.part.108+0x51/0xc0 __perf_event_disable+0x7f/0x160 event_function+0x8c/0xd0 remote_function+0x3c/0x50 flush_smp_call_function_queue+0x35/0xe0 smp_call_function_single_interrupt+0x3a/0xd0 call_function_single_interrupt+0xf/0x20 It was triggered by fuzzer but can be easily reproduced by: # perf record -e cpu/branch-instructions/pu -g -c 1 Peter suggested not to allow branch tracing for precise events: > Now arguably, this is really stupid behaviour. Who in his right mind > wants callchain output on BTS entries. And even if they do, BTS + > precise_ip is nonsensical. > > So in my mind disallowing precise_ip on BTS would be the simplest fix. Suggested-by: Peter Zijlstra Reported-by: Vince Weaver Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Fixes: 6cbc304f2f36 ("perf/x86/intel: Fix unwind errors from PEBS entries (mk-II)") Link: http://lkml.kernel.org/r/20181121101612.16272-3-jolsa@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a15a73788693..ecc3e34ca955 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3106,6 +3106,10 @@ static int intel_pmu_bts_config(struct perf_event *event) if (!attr->exclude_kernel) return -EOPNOTSUPP; + /* BTS is not allowed for precise events. */ + if (attr->precise_ip) + return -EOPNOTSUPP; + /* disallow bts if conflicting events are present */ if (x86_add_exclusive(x86_lbr_exclusive_lbr)) return -EBUSY; -- cgit v1.2.3-55-g7522 From f2a5fef1248beccacec0deecb67c1be693d72ae6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 19 Nov 2018 14:59:45 +0100 Subject: x86/xen: cleanup includes in arch/x86/xen/spinlock.c arch/x86/xen/spinlock.c includes several headers which are not needed. Remove the #includes. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross --- arch/x86/xen/spinlock.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 1c8a8816a402..3776122c87cc 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -3,22 +3,17 @@ * Split spinlock implementation out into its own file, so it can be * compiled in a FTRACE-compatible way. */ -#include +#include #include -#include -#include -#include #include #include #include #include -#include #include #include "xen-ops.h" -#include "debugfs.h" static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; static DEFINE_PER_CPU(char *, irq_name); -- cgit v1.2.3-55-g7522 From a87c99e61236ba8ca962ce97a19fab5ebd588d35 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Fri, 23 Nov 2018 12:02:14 -0500 Subject: KVM: VMX: re-add ple_gap module parameter Apparently, the ple_gap parameter was accidentally removed by commit c8e88717cfc6b36bedea22368d97667446318291. Add it back. Signed-off-by: Luiz Capitulino Cc: stable@vger.kernel.org Fixes: c8e88717cfc6b36bedea22368d97667446318291 Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4555077d69ce..be6f13f1c25f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -174,6 +174,7 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); * refer SDM volume 3b section 21.6.13 & 22.1.3. */ static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; +module_param(ple_gap, uint, 0444); static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, uint, 0444); -- cgit v1.2.3-55-g7522 From 38ab012f109caf10f471db1adf284e620dd8d701 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 20 Nov 2018 09:39:30 +0800 Subject: KVM: LAPIC: Fix pv ipis use-before-initialization Reported by syzkaller: BUG: unable to handle kernel NULL pointer dereference at 0000000000000014 PGD 800000040410c067 P4D 800000040410c067 PUD 40410d067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 2567 Comm: poc Tainted: G OE 4.19.0-rc5 #16 RIP: 0010:kvm_pv_send_ipi+0x94/0x350 [kvm] Call Trace: kvm_emulate_hypercall+0x3cc/0x700 [kvm] handle_vmcall+0xe/0x10 [kvm_intel] vmx_handle_exit+0xc1/0x11b0 [kvm_intel] vcpu_enter_guest+0x9fb/0x1910 [kvm] kvm_arch_vcpu_ioctl_run+0x35c/0x610 [kvm] kvm_vcpu_ioctl+0x3e9/0x6d0 [kvm] do_vfs_ioctl+0xa5/0x690 ksys_ioctl+0x6d/0x80 __x64_sys_ioctl+0x1a/0x20 do_syscall_64+0x83/0x6e0 entry_SYSCALL_64_after_hwframe+0x49/0xbe The reason is that the apic map has not yet been initialized, the testcase triggers pv_send_ipi interface by vmcall which results in kvm->arch.apic_map is dereferenced. This patch fixes it by checking whether or not apic map is NULL and bailing out immediately if that is the case. Fixes: 4180bf1b65 (KVM: X86: Implement "send IPI" hypercall) Reported-by: Wei Wu Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Wei Wu Signed-off-by: Wanpeng Li Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 89db20f8cb70..02f2291dcf7e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -576,6 +576,11 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); + if (unlikely(!map)) { + count = -EOPNOTSUPP; + goto out; + } + if (min > map->max_apic_id) goto out; /* Bits above cluster_size are masked in the caller. */ -- cgit v1.2.3-55-g7522 From e97f852fd4561e77721bb9a4e0ea9d98305b1e93 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 20 Nov 2018 16:34:18 +0800 Subject: KVM: X86: Fix scan ioapic use-before-initialization Reported by syzkaller: BUG: unable to handle kernel NULL pointer dereference at 00000000000001c8 PGD 80000003ec4da067 P4D 80000003ec4da067 PUD 3f7bfa067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 7 PID: 5059 Comm: debug Tainted: G OE 4.19.0-rc5 #16 RIP: 0010:__lock_acquire+0x1a6/0x1990 Call Trace: lock_acquire+0xdb/0x210 _raw_spin_lock+0x38/0x70 kvm_ioapic_scan_entry+0x3e/0x110 [kvm] vcpu_enter_guest+0x167e/0x1910 [kvm] kvm_arch_vcpu_ioctl_run+0x35c/0x610 [kvm] kvm_vcpu_ioctl+0x3e9/0x6d0 [kvm] do_vfs_ioctl+0xa5/0x690 ksys_ioctl+0x6d/0x80 __x64_sys_ioctl+0x1a/0x20 do_syscall_64+0x83/0x6e0 entry_SYSCALL_64_after_hwframe+0x49/0xbe The reason is that the testcase writes hyperv synic HV_X64_MSR_SINT6 msr and triggers scan ioapic logic to load synic vectors into EOI exit bitmap. However, irqchip is not initialized by this simple testcase, ioapic/apic objects should not be accessed. This can be triggered by the following program: #define _GNU_SOURCE #include #include #include #include #include #include #include #include uint64_t r[3] = {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff}; int main(void) { syscall(__NR_mmap, 0x20000000, 0x1000000, 3, 0x32, -1, 0); long res = 0; memcpy((void*)0x20000040, "/dev/kvm", 9); res = syscall(__NR_openat, 0xffffffffffffff9c, 0x20000040, 0, 0); if (res != -1) r[0] = res; res = syscall(__NR_ioctl, r[0], 0xae01, 0); if (res != -1) r[1] = res; res = syscall(__NR_ioctl, r[1], 0xae41, 0); if (res != -1) r[2] = res; memcpy( (void*)0x20000080, "\x01\x00\x00\x00\x00\x5b\x61\xbb\x96\x00\x00\x40\x00\x00\x00\x00\x01\x00" "\x08\x00\x00\x00\x00\x00\x0b\x77\xd1\x78\x4d\xd8\x3a\xed\xb1\x5c\x2e\x43" "\xaa\x43\x39\xd6\xff\xf5\xf0\xa8\x98\xf2\x3e\x37\x29\x89\xde\x88\xc6\x33" "\xfc\x2a\xdb\xb7\xe1\x4c\xac\x28\x61\x7b\x9c\xa9\xbc\x0d\xa0\x63\xfe\xfe" "\xe8\x75\xde\xdd\x19\x38\xdc\x34\xf5\xec\x05\xfd\xeb\x5d\xed\x2e\xaf\x22" "\xfa\xab\xb7\xe4\x42\x67\xd0\xaf\x06\x1c\x6a\x35\x67\x10\x55\xcb", 106); syscall(__NR_ioctl, r[2], 0x4008ae89, 0x20000080); syscall(__NR_ioctl, r[2], 0xae80, 0); return 0; } This patch fixes it by bailing out scan ioapic if ioapic is not initialized in kernel. Reported-by: Wei Wu Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Wei Wu Signed-off-by: Wanpeng Li Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5cd5647120f2..64cae03b2c20 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7455,7 +7455,8 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) else { if (vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); - kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); + if (ioapic_in_kernel(vcpu->kvm)) + kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } if (is_guest_mode(vcpu)) -- cgit v1.2.3-55-g7522 From 30510387a5e45bfcf8190e03ec7aa15b295828e2 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 12 Nov 2018 12:23:14 +0000 Subject: svm: Add mutex_lock to protect apic_access_page_done on AMD systems There is a race condition when accessing kvm->arch.apic_access_page_done. Due to it, x86_set_memory_region will fail when creating the second vcpu for a svm guest. Add a mutex_lock to serialize the accesses to apic_access_page_done. This lock is also used by vmx for the same purpose. Signed-off-by: Wei Wang Signed-off-by: Amadeusz Juskowiak Signed-off-by: Julian Stecklina Signed-off-by: Suravee Suthikulpanit Reviewed-by: Joerg Roedel Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0e21ccc46792..033ec01661b9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1664,20 +1664,23 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, static int avic_init_access_page(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - int ret; + int ret = 0; + mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page_done) - return 0; + goto out; - ret = x86_set_memory_region(kvm, - APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, - APIC_DEFAULT_PHYS_BASE, - PAGE_SIZE); + ret = __x86_set_memory_region(kvm, + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, + PAGE_SIZE); if (ret) - return ret; + goto out; kvm->arch.apic_access_page_done = true; - return 0; +out: + mutex_unlock(&kvm->slots_lock); + return ret; } static int avic_init_backing_page(struct kvm_vcpu *vcpu) -- cgit v1.2.3-55-g7522 From 7f9ad1dfa3c768d1116c2dbacd7a09f9a871534e Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 17 Nov 2018 14:05:06 +0200 Subject: KVM: nVMX: Fix kernel info-leak when enabling KVM_CAP_HYPERV_ENLIGHTENED_VMCS more than once Consider the case that userspace enables KVM_CAP_HYPERV_ENLIGHTENED_VMCS twice: 1) kvm_vcpu_ioctl_enable_cap() is called to enable KVM_CAP_HYPERV_ENLIGHTENED_VMCS which calls nested_enable_evmcs(). 2) nested_enable_evmcs() sets enlightened_vmcs_enabled to true and fills vmcs_version which is then copied to userspace. 3) kvm_vcpu_ioctl_enable_cap() is called again to enable KVM_CAP_HYPERV_ENLIGHTENED_VMCS which calls nested_enable_evmcs(). 4) This time nested_enable_evmcs() just returns 0 as enlightened_vmcs_enabled is already true. *Without filling vmcs_version*. 5) kvm_vcpu_ioctl_enable_cap() continues as usual and copies *uninitialized* vmcs_version to userspace which leads to kernel info-leak. Fix this issue by simply changing nested_enable_evmcs() to always fill vmcs_version output argument. Even when enlightened_vmcs_enabled is already set to true. Note that SVM's nested_enable_evmcs() should not be modified because it always returns a non-zero value (-ENODEV) which results in kvm_vcpu_ioctl_enable_cap() skipping the copy of vmcs_version to userspace (as it should). Fixes: 57b119da3594 ("KVM: nVMX: add KVM_CAP_HYPERV_ENLIGHTENED_VMCS capability") Reported-by: syzbot+cfbc368e283d381f8cef@syzkaller.appspotmail.com Reviewed-by: Krish Sadhukhan Reviewed-by: Vitaly Kuznetsov Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index be6f13f1c25f..065f1df74000 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1611,12 +1611,6 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* We don't support disabling the feature for simplicity. */ - if (vmx->nested.enlightened_vmcs_enabled) - return 0; - - vmx->nested.enlightened_vmcs_enabled = true; - /* * vmcs_version represents the range of supported Enlightened VMCS * versions: lower 8 bits is the minimal version, higher 8 bits is the @@ -1626,6 +1620,12 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu, if (vmcs_version) *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1; + /* We don't support disabling the feature for simplicity. */ + if (vmx->nested.enlightened_vmcs_enabled) + return 0; + + vmx->nested.enlightened_vmcs_enabled = true; + vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; -- cgit v1.2.3-55-g7522 From bcbfbd8ec21096027f1ee13ce6c185e8175166f6 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Thu, 8 Nov 2018 00:43:06 +0200 Subject: KVM: x86: Fix kernel info-leak in KVM_HC_CLOCK_PAIRING hypercall kvm_pv_clock_pairing() allocates local var "struct kvm_clock_pairing clock_pairing" on stack and initializes all it's fields besides padding (clock_pairing.pad[]). Because clock_pairing var is written completely (including padding) to guest memory, failure to init struct padding results in kernel info-leak. Fix the issue by making sure to also init the padding with zeroes. Fixes: 55dd00a73a51 ("KVM: x86: add KVM_HC_CLOCK_PAIRING hypercall") Reported-by: syzbot+a8ef68d71211ba264f56@syzkaller.appspotmail.com Reviewed-by: Mark Kanda Signed-off-by: Liran Alon Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 64cae03b2c20..7e4be1f2f253 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6918,6 +6918,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, clock_pairing.nsec = ts.tv_nsec; clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle); clock_pairing.flags = 0; + memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad)); ret = 0; if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing, -- cgit v1.2.3-55-g7522 From f48b4711dd6e1cf282f9dfd159c14a305909c97c Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Tue, 20 Nov 2018 18:03:25 +0200 Subject: KVM: VMX: Update shared MSRs to be saved/restored on MSR_EFER.LMA changes When guest transitions from/to long-mode by modifying MSR_EFER.LMA, the list of shared MSRs to be saved/restored on guest<->host transitions is updated (See vmx_set_efer() call to setup_msrs()). On every entry to guest, vcpu_enter_guest() calls vmx_prepare_switch_to_guest(). This function should also take care of setting the shared MSRs to be saved/restored. However, the function does nothing in case we are already running with loaded guest state (vmx->loaded_cpu_state != NULL). This means that even when guest modifies MSR_EFER.LMA which results in updating the list of shared MSRs, it isn't being taken into account by vmx_prepare_switch_to_guest() because it happens while we are running with loaded guest state. To fix above mentioned issue, add a flag to mark that the list of shared MSRs has been updated and modify vmx_prepare_switch_to_guest() to set shared MSRs when running with host state *OR* list of shared MSRs has been updated. Note that this issue was mistakenly introduced by commit 678e315e78a7 ("KVM: vmx: add dedicated utility to access guest's kernel_gs_base") because previously vmx_set_efer() always called vmx_load_host_state() which resulted in vmx_prepare_switch_to_guest() to set shared MSRs. Fixes: 678e315e78a7 ("KVM: vmx: add dedicated utility to access guest's kernel_gs_base") Reported-by: Eyal Moscovici Reviewed-by: Mihai Carabas Reviewed-by: Liam Merwick Reviewed-by: Jim Mattson Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 065f1df74000..d09d67310012 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -985,6 +985,7 @@ struct vcpu_vmx { struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; + bool guest_msrs_dirty; unsigned long host_idt_base; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; @@ -2898,6 +2899,20 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmx->req_immediate_exit = false; + /* + * Note that guest MSRs to be saved/restored can also be changed + * when guest state is loaded. This happens when guest transitions + * to/from long-mode by setting MSR_EFER.LMA. + */ + if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) { + vmx->guest_msrs_dirty = false; + for (i = 0; i < vmx->save_nmsrs; ++i) + kvm_set_shared_msr(vmx->guest_msrs[i].index, + vmx->guest_msrs[i].data, + vmx->guest_msrs[i].mask); + + } + if (vmx->loaded_cpu_state) return; @@ -2958,11 +2973,6 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmcs_writel(HOST_GS_BASE, gs_base); host_state->gs_base = gs_base; } - - for (i = 0; i < vmx->save_nmsrs; ++i) - kvm_set_shared_msr(vmx->guest_msrs[i].index, - vmx->guest_msrs[i].data, - vmx->guest_msrs[i].mask); } static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) @@ -3437,6 +3447,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) move_msr_up(vmx, index, save_nmsrs++); vmx->save_nmsrs = save_nmsrs; + vmx->guest_msrs_dirty = true; if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(&vmx->vcpu); -- cgit v1.2.3-55-g7522 From 354cb410d87314e2eda344feea84809e4261570a Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 8 Nov 2018 16:48:36 +0800 Subject: KVM: x86: fix empty-body warnings We get the following warnings about empty statements when building with 'W=1': arch/x86/kvm/lapic.c:632:53: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] arch/x86/kvm/lapic.c:1907:42: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] arch/x86/kvm/lapic.c:1936:65: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] arch/x86/kvm/lapic.c:1975:44: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] Rework the debug helper macro to get rid of these warnings. Signed-off-by: Yi Wang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 02f2291dcf7e..c4533d05c214 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -55,7 +55,7 @@ #define PRIo64 "o" /* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ -#define apic_debug(fmt, arg...) +#define apic_debug(fmt, arg...) do {} while (0) /* 14 is the version for Xeon and Pentium 8.4.8*/ #define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) -- cgit v1.2.3-55-g7522 From 1e4329ee2c52692ea42cc677fb2133519718b34a Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 8 Nov 2018 11:22:21 +0800 Subject: x86/kvm/vmx: fix old-style function declaration The inline keyword which is not at the beginning of the function declaration may trigger the following build warnings, so let's fix it: arch/x86/kvm/vmx.c:1309:1: warning: ‘inline’ is not at beginning of declaration [-Wold-style-declaration] arch/x86/kvm/vmx.c:5947:1: warning: ‘inline’ is not at beginning of declaration [-Wold-style-declaration] arch/x86/kvm/vmx.c:5985:1: warning: ‘inline’ is not at beginning of declaration [-Wold-style-declaration] arch/x86/kvm/vmx.c:6023:1: warning: ‘inline’ is not at beginning of declaration [-Wold-style-declaration] Signed-off-by: Yi Wang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d09d67310012..5f43fcfc225b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1308,7 +1308,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, u16 error_code); static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); -static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type); static DEFINE_PER_CPU(struct vmcs *, vmxarea); @@ -5956,7 +5956,7 @@ static void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } -static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type) { int f = sizeof(unsigned long); @@ -5994,7 +5994,7 @@ static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bit } } -static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type) { int f = sizeof(unsigned long); @@ -6032,7 +6032,7 @@ static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitm } } -static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type, bool value) { if (value) -- cgit v1.2.3-55-g7522 From 326e742533bf0a23f0127d8ea62fb558ba665f08 Mon Sep 17 00:00:00 2001 From: Leonid Shatz Date: Tue, 6 Nov 2018 12:14:25 +0200 Subject: KVM: nVMX/nSVM: Fix bug which sets vcpu->arch.tsc_offset to L1 tsc_offset Since commit e79f245ddec1 ("X86/KVM: Properly update 'tsc_offset' to represent the running guest"), vcpu->arch.tsc_offset meaning was changed to always reflect the tsc_offset value set on active VMCS. Regardless if vCPU is currently running L1 or L2. However, above mentioned commit failed to also change kvm_vcpu_write_tsc_offset() to set vcpu->arch.tsc_offset correctly. This is because vmx_write_tsc_offset() could set the tsc_offset value in active VMCS to given offset parameter *plus vmcs12->tsc_offset*. However, kvm_vcpu_write_tsc_offset() just sets vcpu->arch.tsc_offset to given offset parameter. Without taking into account the possible addition of vmcs12->tsc_offset. (Same is true for SVM case). Fix this issue by changing kvm_x86_ops->write_tsc_offset() to return actually set tsc_offset in active VMCS and modify kvm_vcpu_write_tsc_offset() to set returned value in vcpu->arch.tsc_offset. In addition, rename write_tsc_offset() callback to write_l1_tsc_offset() to make it clear that it is meant to set L1 TSC offset. Fixes: e79f245ddec1 ("X86/KVM: Properly update 'tsc_offset' to represent the running guest") Reviewed-by: Liran Alon Reviewed-by: Mihai Carabas Reviewed-by: Krish Sadhukhan Signed-off-by: Leonid Shatz Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/svm.c | 5 +++-- arch/x86/kvm/vmx.c | 21 +++++++++------------ arch/x86/kvm/x86.c | 6 +++--- 4 files changed, 17 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 55e51ff7e421..fbda5a917c5b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1094,7 +1094,8 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu); - void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + /* Returns actual tsc_offset set in active VMCS */ + u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 033ec01661b9..a24733aade4c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1446,7 +1446,7 @@ static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu) return vcpu->arch.tsc_offset; } -static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); u64 g_tsc_offset = 0; @@ -1464,6 +1464,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) svm->vmcb->control.tsc_offset = offset + g_tsc_offset; mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + return svm->vmcb->control.tsc_offset; } static void avic_init_vmcb(struct vcpu_svm *svm) @@ -7152,7 +7153,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .has_wbinvd_exit = svm_has_wbinvd_exit, .read_l1_tsc_offset = svm_read_l1_tsc_offset, - .write_tsc_offset = svm_write_tsc_offset, + .write_l1_tsc_offset = svm_write_l1_tsc_offset, .set_tdp_cr3 = set_tdp_cr3, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5f43fcfc225b..764c23dc444f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3464,11 +3464,9 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) return vcpu->arch.tsc_offset; } -/* - * writes 'offset' into guest's timestamp counter offset register - */ -static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { + u64 active_offset = offset; if (is_guest_mode(vcpu)) { /* * We're here if L1 chose not to trap WRMSR to TSC. According @@ -3476,17 +3474,16 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) * set for L2 remains unchanged, and still needs to be added * to the newly set TSC to get L2's TSC. */ - struct vmcs12 *vmcs12; - /* recalculate vmcs02.TSC_OFFSET: */ - vmcs12 = get_vmcs12(vcpu); - vmcs_write64(TSC_OFFSET, offset + - (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? - vmcs12->tsc_offset : 0)); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING)) + active_offset += vmcs12->tsc_offset; } else { trace_kvm_write_tsc_offset(vcpu->vcpu_id, vmcs_read64(TSC_OFFSET), offset); - vmcs_write64(TSC_OFFSET, offset); } + + vmcs_write64(TSC_OFFSET, active_offset); + return active_offset; } /* @@ -15074,7 +15071,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .read_l1_tsc_offset = vmx_read_l1_tsc_offset, - .write_tsc_offset = vmx_write_tsc_offset, + .write_l1_tsc_offset = vmx_write_l1_tsc_offset, .set_tdp_cr3 = vmx_set_cr3, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7e4be1f2f253..d02937760c3b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1665,8 +1665,7 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { - kvm_x86_ops->write_tsc_offset(vcpu, offset); - vcpu->arch.tsc_offset = offset; + vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset); } static inline bool kvm_check_tsc_unstable(void) @@ -1794,7 +1793,8 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc); static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { - kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment); + u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); } static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) -- cgit v1.2.3-55-g7522 From 72aeb60c52bf74a0eeec77d6b41ce40145697d76 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Thu, 1 Nov 2018 10:57:39 +0200 Subject: KVM: nVMX: Verify eVMCS revision id match supported eVMCS version on eVMCS VMPTRLD According to TLFS section 16.11.2 Enlightened VMCS, the first u32 field of eVMCS should specify eVMCS VersionNumber. This version should be in the range of supported eVMCS versions exposed to guest via CPUID.0x4000000A.EAX[0:15]. The range which KVM expose to guest in this CPUID field should be the same as the value returned in vmcs_version by nested_enable_evmcs(). According to the above, eVMCS VMPTRLD should verify that version specified in given eVMCS is in the supported range. However, current code mistakenly verfies this field against VMCS12_REVISION. One can also see that when KVM use eVMCS, it makes sure that alloc_vmcs_cpu() sets allocated eVMCS revision_id to KVM_EVMCS_VERSION. Obvious fix should just change eVMCS VMPTRLD to verify first u32 field of eVMCS is equal to KVM_EVMCS_VERSION. However, it turns out that Microsoft Hyper-V fails to comply to their own invented interface: When Hyper-V use eVMCS, it just sets first u32 field of eVMCS to revision_id specified in MSR_IA32_VMX_BASIC (In our case: VMCS12_REVISION). Instead of used eVMCS version number which is one of the supported versions specified in CPUID.0x4000000A.EAX[0:15]. To overcome Hyper-V bug, we accept either a supported eVMCS version or VMCS12_REVISION as valid values for first u32 field of eVMCS. Cc: Vitaly Kuznetsov Reviewed-by: Nikita Leshenko Reviewed-by: Mark Kanda Signed-off-by: Liran Alon Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 764c23dc444f..0ffd8b2dbfe2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9378,7 +9378,30 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); - if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION) { + /* + * Currently, KVM only supports eVMCS version 1 + * (== KVM_EVMCS_VERSION) and thus we expect guest to set this + * value to first u32 field of eVMCS which should specify eVMCS + * VersionNumber. + * + * Guest should be aware of supported eVMCS versions by host by + * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is + * expected to set this CPUID leaf according to the value + * returned in vmcs_version from nested_enable_evmcs(). + * + * However, it turns out that Microsoft Hyper-V fails to comply + * to their own invented interface: When Hyper-V use eVMCS, it + * just sets first u32 field of eVMCS to revision_id specified + * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number + * which is one of the supported versions specified in + * CPUID.0x4000000A.EAX[0:15]. + * + * To overcome Hyper-V bug, we accept here either a supported + * eVMCS version or VMCS12 revision_id as valid values for first + * u32 field of eVMCS. + */ + if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && + (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { nested_release_evmcs(vcpu); return 0; } -- cgit v1.2.3-55-g7522 From 52ad7eb3d668867f9ee2e34d715cfb4860d36a93 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Tue, 13 Nov 2018 17:44:46 +0200 Subject: KVM: nVMX: vmcs12 revision_id is always VMCS12_REVISION even when copied from eVMCS vmcs12 represents the per-CPU cache of L1 active vmcs12. This cache can be loaded by one of the following: 1) Guest making a vmcs12 active by exeucting VMPTRLD 2) Guest specifying eVMCS in VP assist page and executing VMLAUNCH/VMRESUME. Either way, vmcs12 should have revision_id of VMCS12_REVISION. Which is not equal to eVMCS revision_id which specifies used VersionNumber of eVMCS struct (e.g. KVM_EVMCS_VERSION). Specifically, this causes an issue in restoring a nested VM state because vmx_set_nested_state() verifies that vmcs12->revision_id is equal to VMCS12_REVISION which was not true in case vmcs12 was populated from an eVMCS by vmx_get_nested_state() which calls copy_enlightened_to_vmcs12(). Reviewed-by: Darren Kenny Signed-off-by: Liran Alon Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0ffd8b2dbfe2..02edd9960e9d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8673,8 +8673,6 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; - vmcs12->hdr.revision_id = evmcs->revision_id; - /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ vmcs12->tpr_threshold = evmcs->tpr_threshold; vmcs12->guest_rip = evmcs->guest_rip; @@ -9422,9 +9420,11 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, * present in struct hv_enlightened_vmcs, ...). Make sure there * are no leftovers. */ - if (from_launch) - memset(vmx->nested.cached_vmcs12, 0, - sizeof(*vmx->nested.cached_vmcs12)); + if (from_launch) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + memset(vmcs12, 0, sizeof(*vmcs12)); + vmcs12->hdr.revision_id = VMCS12_REVISION; + } } return 1; -- cgit v1.2.3-55-g7522 From 0e0fee5c539b61fdd098332e0e2cc375d9073706 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 31 Oct 2018 14:53:57 -0700 Subject: kvm: mmu: Fix race in emulated page table writes When a guest page table is updated via an emulated write, kvm_mmu_pte_write() is called to update the shadow PTE using the just written guest PTE value. But if two emulated guest PTE writes happened concurrently, it is possible that the guest PTE and the shadow PTE end up being out of sync. Emulated writes do not mark the shadow page as unsync-ed, so this inconsistency will not be resolved even by a guest TLB flush (unless the page was marked as unsync-ed at some other point). This is fixed by re-reading the current value of the guest PTE after the MMU lock has been acquired instead of just using the value that was written prior to calling kvm_mmu_pte_write(). Signed-off-by: Junaid Shahid Reviewed-by: Wanpeng Li Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cf5f572f2305..7c03c0f35444 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5074,9 +5074,9 @@ static bool need_remote_flush(u64 old, u64 new) } static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, - const u8 *new, int *bytes) + int *bytes) { - u64 gentry; + u64 gentry = 0; int r; /* @@ -5088,22 +5088,12 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; - r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8); - if (r) - gentry = 0; - new = (const u8 *)&gentry; } - switch (*bytes) { - case 4: - gentry = *(const u32 *)new; - break; - case 8: - gentry = *(const u64 *)new; - break; - default: - gentry = 0; - break; + if (*bytes == 4 || *bytes == 8) { + r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes); + if (r) + gentry = 0; } return gentry; @@ -5207,8 +5197,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes); - /* * No need to care whether allocation memory is successful * or not since pte prefetch is skiped if it does not have @@ -5217,6 +5205,9 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mmu_topup_memory_caches(vcpu); spin_lock(&vcpu->kvm->mmu_lock); + + gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); + ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); -- cgit v1.2.3-55-g7522 From fd65d3142f734bc4376053c8d75670041903134d Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 22 May 2018 09:54:20 -0700 Subject: kvm: svm: Ensure an IBPB on all affected CPUs when freeing a vmcb Previously, we only called indirect_branch_prediction_barrier on the logical CPU that freed a vmcb. This function should be called on all logical CPUs that last loaded the vmcb in question. Fixes: 15d45071523d ("KVM/x86: Add IBPB support") Reported-by: Neel Natu Signed-off-by: Jim Mattson Reviewed-by: Konrad Rzeszutek Wilk Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a24733aade4c..cc6467b35a85 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2193,21 +2193,31 @@ out: return ERR_PTR(err); } +static void svm_clear_current_vmcb(struct vmcb *vmcb) +{ + int i; + + for_each_online_cpu(i) + cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL); +} + static void svm_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + /* + * The vmcb page can be recycled, causing a false negative in + * svm_vcpu_load(). So, ensure that no logical CPU has this + * vmcb page recorded as its current vmcb. + */ + svm_clear_current_vmcb(svm->vmcb); + __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); - /* - * The vmcb page can be recycled, causing a false negative in - * svm_vcpu_load(). So do a full IBPB now. - */ - indirect_branch_prediction_barrier(); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -- cgit v1.2.3-55-g7522 From ac26d1f74cfc19c8dc9d533b5f20e99dbee3d9bd Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 27 Nov 2018 14:32:00 +0100 Subject: x86/fpu: Use the correct exception table macro in the XSTATE_OP wrapper Commit 75045f77f7a7 ("x86/extable: Introduce _ASM_EXTABLE_UA for uaccess fixups") incorrectly replaced the fixup entry for XSTATE_OP with a user-#PF-only fixup. XRSTOR can also raise #GP if the xstate content is invalid, and _ASM_EXTABLE_UA doesn't expect that. Change this fixup back to _ASM_EXTABLE so that #GP gets fixed up. Fixes: 75045f77f7a7 ("x86/extable: Introduce _ASM_EXTABLE_UA for uaccess fixups") Reported-by: Sebastian Andrzej Siewior Signed-off-by: Jann Horn Signed-off-by: Borislav Petkov Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: "H. Peter Anvin" Cc: "Naveen N. Rao" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: x86-ml Link: https://lkml.kernel.org/r/20181126165957.xhsyu2dhyy45mrjo@linutronix.de Link: https://lkml.kernel.org/r/20181127133200.38322-1-jannh@google.com --- arch/x86/include/asm/fpu/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 5f7290e6e954..69dcdf195b61 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -226,7 +226,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) "3: movl $-2,%[err]\n\t" \ "jmp 2b\n\t" \ ".popsection\n\t" \ - _ASM_EXTABLE_UA(1b, 3b) \ + _ASM_EXTABLE(1b, 3b) \ : [err] "=r" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") -- cgit v1.2.3-55-g7522 From 07f7175b43827640d1e69c9eded89aa089a234b4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt (VMware) Date: Sun, 18 Nov 2018 17:14:10 -0500 Subject: x86/function_graph: Simplify with function_graph_enter() The function_graph_enter() function does the work of calling the function graph hook function and the management of the shadow stack, simplifying the work done in the architecture dependent prepare_ftrace_return(). Have x86 use the new code, and remove the shadow stack management as well as having to set up the trace structure. This is needed to prepare for a fix of a design bug on how the curr_ret_stack is used. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/ftrace.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 01ebcb6f263e..7ee8067cbf45 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -994,7 +994,6 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, { unsigned long old; int faulted; - struct ftrace_graph_ent trace; unsigned long return_hooker = (unsigned long) &return_to_handler; @@ -1046,19 +1045,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, return; } - trace.func = self_addr; - trace.depth = current->curr_ret_stack + 1; - - /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) { + if (function_graph_enter(old, self_addr, frame_pointer, parent)) *parent = old; - return; - } - - if (ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer, parent) == -EBUSY) { - *parent = old; - return; - } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- cgit v1.2.3-55-g7522 From 60c8144afc287ef09ce8c1230c6aa972659ba1bb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 27 Nov 2018 14:41:37 +0100 Subject: x86/MCE/AMD: Fix the thresholding machinery initialization order Currently, the code sets up the thresholding interrupt vector and only then goes about initializing the thresholding banks. Which is wrong, because an early thresholding interrupt would cause a NULL pointer dereference when accessing those banks and prevent the machine from booting. Therefore, set the thresholding interrupt vector only *after* having initialized the banks successfully. Fixes: 18807ddb7f88 ("x86/mce/AMD: Reset Threshold Limit after logging error") Reported-by: Rafał Miłecki Reported-by: John Clemens Signed-off-by: Borislav Petkov Tested-by: Rafał Miłecki Tested-by: John Clemens Cc: Aravind Gopalakrishnan Cc: linux-edac@vger.kernel.org Cc: stable@vger.kernel.org Cc: Tony Luck Cc: x86@kernel.org Cc: Yazen Ghannam Link: https://lkml.kernel.org/r/20181127101700.2964-1-zajec5@gmail.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=201291 --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index dd33c357548f..e12454e21b8a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -56,7 +56,7 @@ /* Threshold LVT offset is at MSR0xC0000410[15:12] */ #define SMCA_THR_LVT_OFF 0xF000 -static bool thresholding_en; +static bool thresholding_irq_en; static const char * const th_names[] = { "load_store", @@ -534,9 +534,8 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, set_offset: offset = setup_APIC_mce_threshold(offset, new); - - if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) - mce_threshold_vector = amd_threshold_interrupt; + if (offset == new) + thresholding_irq_en = true; done: mce_threshold_block_init(&b, offset); @@ -1357,9 +1356,6 @@ int mce_threshold_remove_device(unsigned int cpu) { unsigned int bank; - if (!thresholding_en) - return 0; - for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; @@ -1377,9 +1373,6 @@ int mce_threshold_create_device(unsigned int cpu) struct threshold_bank **bp; int err = 0; - if (!thresholding_en) - return 0; - bp = per_cpu(threshold_banks, cpu); if (bp) return 0; @@ -1408,9 +1401,6 @@ static __init int threshold_init_device(void) { unsigned lcpu = 0; - if (mce_threshold_vector == amd_threshold_interrupt) - thresholding_en = true; - /* to hit CPUs online before the notifier is up */ for_each_online_cpu(lcpu) { int err = mce_threshold_create_device(lcpu); @@ -1419,6 +1409,9 @@ static __init int threshold_init_device(void) return err; } + if (thresholding_irq_en) + mce_threshold_vector = amd_threshold_interrupt; + return 0; } /* -- cgit v1.2.3-55-g7522 From 4cd24de3a0980bf3100c9dcb08ef65ca7c31af48 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Fri, 2 Nov 2018 01:45:41 -0700 Subject: x86/retpoline: Make CONFIG_RETPOLINE depend on compiler support Since retpoline capable compilers are widely available, make CONFIG_RETPOLINE hard depend on the compiler capability. Break the build when CONFIG_RETPOLINE is enabled and the compiler does not support it. Emit an error message in that case: "arch/x86/Makefile:226: *** You are building kernel with non-retpoline compiler, please update your compiler.. Stop." [dwmw: Fail the build with non-retpoline compiler] Suggested-by: Peter Zijlstra Signed-off-by: Zhenzhong Duan Signed-off-by: Thomas Gleixner Cc: David Woodhouse Cc: Borislav Petkov Cc: Daniel Borkmann Cc: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Andy Lutomirski Cc: Masahiro Yamada Cc: Michal Marek Cc: Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/cca0cb20-f9e2-4094-840b-fb0f8810cd34@default --- arch/x86/Kconfig | 4 ---- arch/x86/Makefile | 5 +++-- arch/x86/include/asm/nospec-branch.h | 10 ++++++---- arch/x86/kernel/cpu/bugs.c | 2 +- scripts/Makefile.build | 2 -- 5 files changed, 10 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9d734f3c8234..b5286ad2a982 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -444,10 +444,6 @@ config RETPOLINE branches. Requires a compiler with -mindirect-branch=thunk-extern support for full protection. The kernel may run slower. - Without compiler support, at least indirect branches in assembler - code are eliminated. Since this includes the syscall entry path, - it is not entirely pointless. - config INTEL_RDT bool "Intel Resource Director Technology support" depends on X86 && CPU_SUP_INTEL diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 88398fdf8129..f5d7f4134524 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -220,9 +220,10 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables # Avoid indirect branches in kernel to deal with Spectre ifdef CONFIG_RETPOLINE -ifneq ($(RETPOLINE_CFLAGS),) - KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE +ifeq ($(RETPOLINE_CFLAGS),) + $(error You are building kernel with non-retpoline compiler, please update your compiler.) endif + KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) endif archscripts: scripts_basic diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 80dc14422495..8b09cbb2d52c 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -162,11 +162,12 @@ _ASM_PTR " 999b\n\t" \ ".popsection\n\t" -#if defined(CONFIG_X86_64) && defined(RETPOLINE) +#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_X86_64 /* - * Since the inline asm uses the %V modifier which is only in newer GCC, - * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE. + * Inline asm uses the %V modifier which is only in newer GCC + * which is ensured when CONFIG_RETPOLINE is defined. */ # define CALL_NOSPEC \ ANNOTATE_NOSPEC_ALTERNATIVE \ @@ -181,7 +182,7 @@ X86_FEATURE_RETPOLINE_AMD) # define THUNK_TARGET(addr) [thunk_target] "r" (addr) -#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE) +#else /* CONFIG_X86_32 */ /* * For i386 we use the original ret-equivalent retpoline, because * otherwise we'll run out of registers. We don't care about CET @@ -211,6 +212,7 @@ X86_FEATURE_RETPOLINE_AMD) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#endif #else /* No retpoline for C / inline asm */ # define CALL_NOSPEC "call *%[thunk_target]\n" # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c37e66e493bf..d0108fb6e4dd 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -252,7 +252,7 @@ static void __init spec2_print_if_secure(const char *reason) static inline bool retp_compiler(void) { - return __is_defined(RETPOLINE); + return __is_defined(CONFIG_RETPOLINE); } static inline bool match_option(const char *arg, int arglen, const char *opt) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index a8e7ba9f73e8..6a6be9f440cf 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -236,10 +236,8 @@ ifdef CONFIG_GCOV_KERNEL objtool_args += --no-unreachable endif ifdef CONFIG_RETPOLINE -ifneq ($(RETPOLINE_CFLAGS),) objtool_args += --retpoline endif -endif ifdef CONFIG_MODVERSIONS -- cgit v1.2.3-55-g7522 From ef014aae8f1cd2793e4e014bbb102bed53f852b7 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Fri, 2 Nov 2018 01:45:41 -0700 Subject: x86/retpoline: Remove minimal retpoline support Now that CONFIG_RETPOLINE hard depends on compiler support, there is no reason to keep the minimal retpoline support around which only provided basic protection in the assembly files. Suggested-by: Peter Zijlstra Signed-off-by: Zhenzhong Duan Signed-off-by: Thomas Gleixner Cc: David Woodhouse Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/f06f0a89-5587-45db-8ed2-0a9d6638d5c0@default --- arch/x86/include/asm/nospec-branch.h | 2 -- arch/x86/kernel/cpu/bugs.c | 13 ++----------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 8b09cbb2d52c..c202a64edd95 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -221,8 +221,6 @@ /* The Spectre V2 mitigation variants */ enum spectre_v2_mitigation { SPECTRE_V2_NONE, - SPECTRE_V2_RETPOLINE_MINIMAL, - SPECTRE_V2_RETPOLINE_MINIMAL_AMD, SPECTRE_V2_RETPOLINE_GENERIC, SPECTRE_V2_RETPOLINE_AMD, SPECTRE_V2_IBRS_ENHANCED, diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d0108fb6e4dd..7f6d8159398e 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -135,8 +135,6 @@ enum spectre_v2_mitigation_cmd { static const char *spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", - [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline", - [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", @@ -250,11 +248,6 @@ static void __init spec2_print_if_secure(const char *reason) pr_info("%s selected on command line.\n", reason); } -static inline bool retp_compiler(void) -{ - return __is_defined(CONFIG_RETPOLINE); -} - static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); @@ -417,14 +410,12 @@ retpoline_auto: pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); goto retpoline_generic; } - mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD : - SPECTRE_V2_RETPOLINE_MINIMAL_AMD; + mode = SPECTRE_V2_RETPOLINE_AMD; setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); setup_force_cpu_cap(X86_FEATURE_RETPOLINE); } else { retpoline_generic: - mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC : - SPECTRE_V2_RETPOLINE_MINIMAL; + mode = SPECTRE_V2_RETPOLINE_GENERIC; setup_force_cpu_cap(X86_FEATURE_RETPOLINE); } -- cgit v1.2.3-55-g7522 From 8eb729b77faf83ac4c1f363a9ad68d042415f24c Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Sun, 25 Nov 2018 19:33:29 +0100 Subject: x86/speculation: Update the TIF_SSBD comment "Reduced Data Speculation" is an obsolete term. The correct new name is "Speculative store bypass disable" - which is abbreviated into SSBD. Signed-off-by: Tim Chen Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185003.593893901@linutronix.de --- arch/x86/include/asm/thread_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 2ff2a30a264f..523c69efc38a 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -79,7 +79,7 @@ struct thread_info { #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ -#define TIF_SSBD 5 /* Reduced data speculation */ +#define TIF_SSBD 5 /* Speculative store bypass disable */ #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ -- cgit v1.2.3-55-g7522 From 24848509aa55eac39d524b587b051f4e86df3c12 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Sun, 25 Nov 2018 19:33:30 +0100 Subject: x86/speculation: Clean up spectre_v2_parse_cmdline() Remove the unnecessary 'else' statement in spectre_v2_parse_cmdline() to save an indentation level. Signed-off-by: Tim Chen Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185003.688010903@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7f6d8159398e..839ab4103e89 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -276,22 +276,21 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) return SPECTRE_V2_CMD_NONE; - else { - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); - if (ret < 0) - return SPECTRE_V2_CMD_AUTO; - for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { - if (!match_option(arg, ret, mitigation_options[i].option)) - continue; - cmd = mitigation_options[i].cmd; - break; - } + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); + if (ret < 0) + return SPECTRE_V2_CMD_AUTO; - if (i >= ARRAY_SIZE(mitigation_options)) { - pr_err("unknown option (%s). Switching to AUTO select\n", arg); - return SPECTRE_V2_CMD_AUTO; - } + for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { + if (!match_option(arg, ret, mitigation_options[i].option)) + continue; + cmd = mitigation_options[i].cmd; + break; + } + + if (i >= ARRAY_SIZE(mitigation_options)) { + pr_err("unknown option (%s). Switching to AUTO select\n", arg); + return SPECTRE_V2_CMD_AUTO; } if ((cmd == SPECTRE_V2_CMD_RETPOLINE || -- cgit v1.2.3-55-g7522 From b86bda0426853bfe8a3506c7d2a5b332760ae46b Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Sun, 25 Nov 2018 19:33:31 +0100 Subject: x86/speculation: Remove unnecessary ret variable in cpu_show_common() Signed-off-by: Tim Chen Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185003.783903657@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 839ab4103e89..b52a48966e01 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -847,8 +847,6 @@ static ssize_t l1tf_show_state(char *buf) static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { - int ret; - if (!boot_cpu_has_bug(bug)) return sprintf(buf, "Not affected\n"); @@ -866,13 +864,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return sprintf(buf, "Mitigation: __user pointer sanitization\n"); case X86_BUG_SPECTRE_V2: - ret = sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], + return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", (x86_spec_ctrl_base & SPEC_CTRL_STIBP) ? ", STIBP" : "", boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", spectre_v2_module_string()); - return ret; case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); -- cgit v1.2.3-55-g7522 From a8f76ae41cd633ac00be1b3019b1eb4741be3828 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Sun, 25 Nov 2018 19:33:32 +0100 Subject: x86/speculation: Move STIPB/IBPB string conditionals out of cpu_show_common() The Spectre V2 printout in cpu_show_common() handles conditionals for the various mitigation methods directly in the sprintf() argument list. That's hard to read and will become unreadable if more complex decisions need to be made for a particular method. Move the conditionals for STIBP and IBPB string selection into helper functions, so they can be extended later on. Signed-off-by: Tim Chen Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185003.874479208@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index b52a48966e01..a1502bce9eb8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -844,6 +844,22 @@ static ssize_t l1tf_show_state(char *buf) } #endif +static char *stibp_state(void) +{ + if (x86_spec_ctrl_base & SPEC_CTRL_STIBP) + return ", STIBP"; + else + return ""; +} + +static char *ibpb_state(void) +{ + if (boot_cpu_has(X86_FEATURE_USE_IBPB)) + return ", IBPB"; + else + return ""; +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -865,9 +881,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_SPECTRE_V2: return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", + ibpb_state(), boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - (x86_spec_ctrl_base & SPEC_CTRL_STIBP) ? ", STIBP" : "", + stibp_state(), boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", spectre_v2_module_string()); -- cgit v1.2.3-55-g7522 From 34bce7c9690b1d897686aac89604ba7adc365556 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Sun, 25 Nov 2018 19:33:33 +0100 Subject: x86/speculation: Disable STIBP when enhanced IBRS is in use If enhanced IBRS is active, STIBP is redundant for mitigating Spectre v2 user space exploits from hyperthread sibling. Disable STIBP when enhanced IBRS is used. Signed-off-by: Tim Chen Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185003.966801480@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a1502bce9eb8..924cd06dd43b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -321,6 +321,10 @@ static bool stibp_needed(void) if (spectre_v2_enabled == SPECTRE_V2_NONE) return false; + /* Enhanced IBRS makes using STIBP unnecessary. */ + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + return false; + if (!boot_cpu_has(X86_FEATURE_STIBP)) return false; @@ -846,6 +850,9 @@ static ssize_t l1tf_show_state(char *buf) static char *stibp_state(void) { + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + return ""; + if (x86_spec_ctrl_base & SPEC_CTRL_STIBP) return ", STIBP"; else -- cgit v1.2.3-55-g7522 From 26c4d75b234040c11728a8acb796b3a85ba7507c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:34 +0100 Subject: x86/speculation: Rename SSBD update functions During context switch, the SSBD bit in SPEC_CTRL MSR is updated according to changes of the TIF_SSBD flag in the current and next running task. Currently, only the bit controlling speculative store bypass disable in SPEC_CTRL MSR is updated and the related update functions all have "speculative_store" or "ssb" in their names. For enhanced mitigation control other bits in SPEC_CTRL MSR need to be updated as well, which makes the SSB names inadequate. Rename the "speculative_store*" functions to a more generic name. No functional change. Signed-off-by: Tim Chen Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.058866968@linutronix.de --- arch/x86/include/asm/spec-ctrl.h | 6 +++--- arch/x86/kernel/cpu/bugs.c | 4 ++-- arch/x86/kernel/process.c | 12 ++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h index ae7c2c5cd7f0..8e2f8411c7a7 100644 --- a/arch/x86/include/asm/spec-ctrl.h +++ b/arch/x86/include/asm/spec-ctrl.h @@ -70,11 +70,11 @@ extern void speculative_store_bypass_ht_init(void); static inline void speculative_store_bypass_ht_init(void) { } #endif -extern void speculative_store_bypass_update(unsigned long tif); +extern void speculation_ctrl_update(unsigned long tif); -static inline void speculative_store_bypass_update_current(void) +static inline void speculation_ctrl_update_current(void) { - speculative_store_bypass_update(current_thread_info()->flags); + speculation_ctrl_update(current_thread_info()->flags); } #endif diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 924cd06dd43b..a723af0c4400 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -200,7 +200,7 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) : ssbd_spec_ctrl_to_tif(hostval); - speculative_store_bypass_update(tif); + speculation_ctrl_update(tif); } } EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); @@ -632,7 +632,7 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) * mitigation until it is next scheduled. */ if (task == current && update) - speculative_store_bypass_update_current(); + speculation_ctrl_update_current(); return 0; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c93fcfdf1673..8aa49604f9ae 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -395,27 +395,27 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); } -static __always_inline void intel_set_ssb_state(unsigned long tifn) +static __always_inline void spec_ctrl_update_msr(unsigned long tifn) { u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn); wrmsrl(MSR_IA32_SPEC_CTRL, msr); } -static __always_inline void __speculative_store_bypass_update(unsigned long tifn) +static __always_inline void __speculation_ctrl_update(unsigned long tifn) { if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) amd_set_ssb_virt_state(tifn); else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) amd_set_core_ssb_state(tifn); else - intel_set_ssb_state(tifn); + spec_ctrl_update_msr(tifn); } -void speculative_store_bypass_update(unsigned long tif) +void speculation_ctrl_update(unsigned long tif) { preempt_disable(); - __speculative_store_bypass_update(tif); + __speculation_ctrl_update(tif); preempt_enable(); } @@ -452,7 +452,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); if ((tifp ^ tifn) & _TIF_SSBD) - __speculative_store_bypass_update(tifn); + __speculation_ctrl_update(tifn); } /* -- cgit v1.2.3-55-g7522 From 01daf56875ee0cd50ed496a09b20eb369b45dfa5 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Sun, 25 Nov 2018 19:33:35 +0100 Subject: x86/speculation: Reorganize speculation control MSRs update The logic to detect whether there's a change in the previous and next task's flag relevant to update speculation control MSRs is spread out across multiple functions. Consolidate all checks needed for updating speculation control MSRs into the new __speculation_ctrl_update() helper function. This makes it easy to pick the right speculation control MSR and the bits in MSR_IA32_SPEC_CTRL that need updating based on TIF flags changes. Originally-by: Thomas Lendacky Signed-off-by: Tim Chen Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.151077005@linutronix.de --- arch/x86/kernel/process.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 8aa49604f9ae..70e9832379e1 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -395,27 +395,40 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); } -static __always_inline void spec_ctrl_update_msr(unsigned long tifn) -{ - u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn); - - wrmsrl(MSR_IA32_SPEC_CTRL, msr); -} +/* + * Update the MSRs managing speculation control, during context switch. + * + * tifp: Previous task's thread flags + * tifn: Next task's thread flags + */ +static __always_inline void __speculation_ctrl_update(unsigned long tifp, + unsigned long tifn) +{ + u64 msr = x86_spec_ctrl_base; + bool updmsr = false; + + /* If TIF_SSBD is different, select the proper mitigation method */ + if ((tifp ^ tifn) & _TIF_SSBD) { + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { + amd_set_ssb_virt_state(tifn); + } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { + amd_set_core_ssb_state(tifn); + } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || + static_cpu_has(X86_FEATURE_AMD_SSBD)) { + msr |= ssbd_tif_to_spec_ctrl(tifn); + updmsr = true; + } + } -static __always_inline void __speculation_ctrl_update(unsigned long tifn) -{ - if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) - amd_set_ssb_virt_state(tifn); - else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) - amd_set_core_ssb_state(tifn); - else - spec_ctrl_update_msr(tifn); + if (updmsr) + wrmsrl(MSR_IA32_SPEC_CTRL, msr); } void speculation_ctrl_update(unsigned long tif) { + /* Forced update. Make sure all relevant TIF flags are different */ preempt_disable(); - __speculation_ctrl_update(tif); + __speculation_ctrl_update(~tif, tif); preempt_enable(); } @@ -451,8 +464,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, if ((tifp ^ tifn) & _TIF_NOCPUID) set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); - if ((tifp ^ tifn) & _TIF_SSBD) - __speculation_ctrl_update(tifn); + __speculation_ctrl_update(tifp, tifn); } /* -- cgit v1.2.3-55-g7522 From dbe733642e01dd108f71436aaea7b328cb28fd87 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:37 +0100 Subject: x86/Kconfig: Select SCHED_SMT if SMP enabled CONFIG_SCHED_SMT is enabled by all distros, so there is not a real point to have it configurable. The runtime overhead in the core scheduler code is minimal because the actual SMT scheduling parts are conditional on a static key. This allows to expose the scheduler's SMT state static key to the speculation control code. Alternatively the scheduler's static key could be made always available when CONFIG_SMP is enabled, but that's just adding an unused static key to every other architecture for nothing. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.337452245@linutronix.de --- arch/x86/Kconfig | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b5286ad2a982..8689e794a43c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1000,13 +1000,7 @@ config NR_CPUS to the kernel image. config SCHED_SMT - bool "SMT (Hyperthreading) scheduler support" - depends on SMP - ---help--- - SMT scheduler support improves the CPU scheduler's decision making - when dealing with Intel Pentium 4 chips with HyperThreading at a - cost of slightly increased overhead in some places. If unsure say - N here. + def_bool y if SMP config SCHED_MC def_bool y -- cgit v1.2.3-55-g7522 From a74cfffb03b73d41e08f84c2e5c87dec0ce3db9f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:39 +0100 Subject: x86/speculation: Rework SMT state change arch_smt_update() is only called when the sysfs SMT control knob is changed. This means that when SMT is enabled in the sysfs control knob the system is considered to have SMT active even if all siblings are offline. To allow finegrained control of the speculation mitigations, the actual SMT state is more interesting than the fact that siblings could be enabled. Rework the code, so arch_smt_update() is invoked from each individual CPU hotplug function, and simplify the update function while at it. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.521974984@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 11 +++++------ include/linux/sched/smt.h | 2 ++ kernel/cpu.c | 15 +++++++++------ 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a723af0c4400..5625b323ff32 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -344,16 +345,14 @@ void arch_smt_update(void) return; mutex_lock(&spec_ctrl_mutex); - mask = x86_spec_ctrl_base; - if (cpu_smt_control == CPU_SMT_ENABLED) + + mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; + if (sched_smt_active()) mask |= SPEC_CTRL_STIBP; - else - mask &= ~SPEC_CTRL_STIBP; if (mask != x86_spec_ctrl_base) { pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n", - cpu_smt_control == CPU_SMT_ENABLED ? - "Enabling" : "Disabling"); + mask & SPEC_CTRL_STIBP ? "Enabling" : "Disabling"); x86_spec_ctrl_base = mask; on_each_cpu(update_stibp_msr, NULL, 1); } diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index c9e0be514110..59d3736c454c 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -15,4 +15,6 @@ static __always_inline bool sched_smt_active(void) static inline bool sched_smt_active(void) { return false; } #endif +void arch_smt_update(void); + #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 3c7f3b4c453c..91d5c38eb7e5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -367,6 +368,12 @@ static void lockdep_release_cpus_lock(void) #endif /* CONFIG_HOTPLUG_CPU */ +/* + * Architectures that need SMT-specific errata handling during SMT hotplug + * should override this. + */ +void __weak arch_smt_update(void) { } + #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; EXPORT_SYMBOL_GPL(cpu_smt_control); @@ -1011,6 +1018,7 @@ out: * concurrent CPU hotplug via cpu_add_remove_lock. */ lockup_detector_cleanup(); + arch_smt_update(); return ret; } @@ -1139,6 +1147,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) ret = cpuhp_up_callbacks(cpu, st, target); out: cpus_write_unlock(); + arch_smt_update(); return ret; } @@ -2055,12 +2064,6 @@ static void cpuhp_online_cpu_device(unsigned int cpu) kobject_uevent(&dev->kobj, KOBJ_ONLINE); } -/* - * Architectures that need SMT-specific errata handling during SMT hotplug - * should override this. - */ -void __weak arch_smt_update(void) { }; - static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; -- cgit v1.2.3-55-g7522 From 130d6f946f6f2a972ee3ec8540b7243ab99abe97 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:40 +0100 Subject: x86/l1tf: Show actual SMT state Use the now exposed real SMT state, not the SMT sysfs control knob state. This reflects the state of the system when the mitigation status is queried. This does not change the warning in the VMX launch code. There the dependency on the control knob makes sense because siblings could be brought online anytime after launching the VM. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.613357354@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 5625b323ff32..2dc4ee2bedcb 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -832,13 +832,14 @@ static ssize_t l1tf_show_state(char *buf) if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED || (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER && - cpu_smt_control == CPU_SMT_ENABLED)) + sched_smt_active())) { return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, l1tf_vmx_states[l1tf_vmx_mitigation]); + } return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, l1tf_vmx_states[l1tf_vmx_mitigation], - cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled"); + sched_smt_active() ? "vulnerable" : "disabled"); } #else static ssize_t l1tf_show_state(char *buf) -- cgit v1.2.3-55-g7522 From 15d6b7aab0793b2de8a05d8a828777dd24db424e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:41 +0100 Subject: x86/speculation: Reorder the spec_v2 code Reorder the code so it is better grouped. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.707122879@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 168 ++++++++++++++++++++++----------------------- 1 file changed, 84 insertions(+), 84 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 2dc4ee2bedcb..c9542b9fb329 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -124,29 +124,6 @@ void __init check_bugs(void) #endif } -/* The kernel command line selection */ -enum spectre_v2_mitigation_cmd { - SPECTRE_V2_CMD_NONE, - SPECTRE_V2_CMD_AUTO, - SPECTRE_V2_CMD_FORCE, - SPECTRE_V2_CMD_RETPOLINE, - SPECTRE_V2_CMD_RETPOLINE_GENERIC, - SPECTRE_V2_CMD_RETPOLINE_AMD, -}; - -static const char *spectre_v2_strings[] = { - [SPECTRE_V2_NONE] = "Vulnerable", - [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", - [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", - [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", -}; - -#undef pr_fmt -#define pr_fmt(fmt) "Spectre V2 : " fmt - -static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = - SPECTRE_V2_NONE; - void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) { @@ -216,6 +193,12 @@ static void x86_amd_ssb_disable(void) wrmsrl(MSR_AMD64_LS_CFG, msrval); } +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 : " fmt + +static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = + SPECTRE_V2_NONE; + #ifdef RETPOLINE static bool spectre_v2_bad_module; @@ -237,18 +220,6 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif -static void __init spec2_print_if_insecure(const char *reason) -{ - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) - pr_info("%s selected on command line.\n", reason); -} - -static void __init spec2_print_if_secure(const char *reason) -{ - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) - pr_info("%s selected on command line.\n", reason); -} - static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); @@ -256,24 +227,53 @@ static inline bool match_option(const char *arg, int arglen, const char *opt) return len == arglen && !strncmp(arg, opt, len); } +/* The kernel command line selection for spectre v2 */ +enum spectre_v2_mitigation_cmd { + SPECTRE_V2_CMD_NONE, + SPECTRE_V2_CMD_AUTO, + SPECTRE_V2_CMD_FORCE, + SPECTRE_V2_CMD_RETPOLINE, + SPECTRE_V2_CMD_RETPOLINE_GENERIC, + SPECTRE_V2_CMD_RETPOLINE_AMD, +}; + +static const char *spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", + [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", + [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", +}; + static const struct { const char *option; enum spectre_v2_mitigation_cmd cmd; bool secure; } mitigation_options[] = { - { "off", SPECTRE_V2_CMD_NONE, false }, - { "on", SPECTRE_V2_CMD_FORCE, true }, - { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, - { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, - { "auto", SPECTRE_V2_CMD_AUTO, false }, + { "off", SPECTRE_V2_CMD_NONE, false }, + { "on", SPECTRE_V2_CMD_FORCE, true }, + { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, + { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, + { "auto", SPECTRE_V2_CMD_AUTO, false }, }; +static void __init spec2_print_if_insecure(const char *reason) +{ + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s selected on command line.\n", reason); +} + +static void __init spec2_print_if_secure(const char *reason) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s selected on command line.\n", reason); +} + static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) { + enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; char arg[20]; int ret, i; - enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) return SPECTRE_V2_CMD_NONE; @@ -317,48 +317,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } -static bool stibp_needed(void) -{ - if (spectre_v2_enabled == SPECTRE_V2_NONE) - return false; - - /* Enhanced IBRS makes using STIBP unnecessary. */ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) - return false; - - if (!boot_cpu_has(X86_FEATURE_STIBP)) - return false; - - return true; -} - -static void update_stibp_msr(void *info) -{ - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); -} - -void arch_smt_update(void) -{ - u64 mask; - - if (!stibp_needed()) - return; - - mutex_lock(&spec_ctrl_mutex); - - mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; - if (sched_smt_active()) - mask |= SPEC_CTRL_STIBP; - - if (mask != x86_spec_ctrl_base) { - pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n", - mask & SPEC_CTRL_STIBP ? "Enabling" : "Disabling"); - x86_spec_ctrl_base = mask; - on_each_cpu(update_stibp_msr, NULL, 1); - } - mutex_unlock(&spec_ctrl_mutex); -} - static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -462,6 +420,48 @@ specv2_set_mode: arch_smt_update(); } +static bool stibp_needed(void) +{ + if (spectre_v2_enabled == SPECTRE_V2_NONE) + return false; + + /* Enhanced IBRS makes using STIBP unnecessary. */ + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + return false; + + if (!boot_cpu_has(X86_FEATURE_STIBP)) + return false; + + return true; +} + +static void update_stibp_msr(void *info) +{ + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); +} + +void arch_smt_update(void) +{ + u64 mask; + + if (!stibp_needed()) + return; + + mutex_lock(&spec_ctrl_mutex); + + mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; + if (sched_smt_active()) + mask |= SPEC_CTRL_STIBP; + + if (mask != x86_spec_ctrl_base) { + pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n", + mask & SPEC_CTRL_STIBP ? "Enabling" : "Disabling"); + x86_spec_ctrl_base = mask; + on_each_cpu(update_stibp_msr, NULL, 1); + } + mutex_unlock(&spec_ctrl_mutex); +} + #undef pr_fmt #define pr_fmt(fmt) "Speculative Store Bypass: " fmt -- cgit v1.2.3-55-g7522 From 8770709f411763884535662744a3786a1806afd3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:42 +0100 Subject: x86/speculation: Mark string arrays const correctly checkpatch.pl muttered when reshuffling the code: WARNING: static const char * array should probably be static const char * const Fix up all the string arrays. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.800018931@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c9542b9fb329..4fcbccb16200 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -237,7 +237,7 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_RETPOLINE_AMD, }; -static const char *spectre_v2_strings[] = { +static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", @@ -476,7 +476,7 @@ enum ssb_mitigation_cmd { SPEC_STORE_BYPASS_CMD_SECCOMP, }; -static const char *ssb_strings[] = { +static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_NONE] = "Vulnerable", [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled", [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl", @@ -816,7 +816,7 @@ early_param("l1tf", l1tf_cmdline); #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" #if IS_ENABLED(CONFIG_KVM_INTEL) -static const char *l1tf_vmx_states[] = { +static const char * const l1tf_vmx_states[] = { [VMENTER_L1D_FLUSH_AUTO] = "auto", [VMENTER_L1D_FLUSH_NEVER] = "vulnerable", [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes", -- cgit v1.2.3-55-g7522 From 30ba72a990f5096ae08f284de17986461efcc408 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:43 +0100 Subject: x86/speculataion: Mark command line parser data __initdata No point to keep that around. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.893886356@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 4fcbccb16200..9279cbabe16e 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -248,7 +248,7 @@ static const struct { const char *option; enum spectre_v2_mitigation_cmd cmd; bool secure; -} mitigation_options[] = { +} mitigation_options[] __initdata = { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, @@ -486,7 +486,7 @@ static const char * const ssb_strings[] = { static const struct { const char *option; enum ssb_mitigation_cmd cmd; -} ssb_mitigation_options[] = { +} ssb_mitigation_options[] __initdata = { { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ -- cgit v1.2.3-55-g7522 From 495d470e9828500e0155027f230449ac5e29c025 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:44 +0100 Subject: x86/speculation: Unify conditional spectre v2 print functions There is no point in having two functions and a conditional at the call site. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.986890749@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9279cbabe16e..4f5a6319dca6 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -257,15 +257,9 @@ static const struct { { "auto", SPECTRE_V2_CMD_AUTO, false }, }; -static void __init spec2_print_if_insecure(const char *reason) +static void __init spec_v2_print_cond(const char *reason, bool secure) { - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) - pr_info("%s selected on command line.\n", reason); -} - -static void __init spec2_print_if_secure(const char *reason) -{ - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) pr_info("%s selected on command line.\n", reason); } @@ -309,11 +303,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } - if (mitigation_options[i].secure) - spec2_print_if_secure(mitigation_options[i].option); - else - spec2_print_if_insecure(mitigation_options[i].option); - + spec_v2_print_cond(mitigation_options[i].option, + mitigation_options[i].secure); return cmd; } -- cgit v1.2.3-55-g7522 From fa1202ef224391b6f5b26cdd44cc50495e8fab54 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:45 +0100 Subject: x86/speculation: Add command line control for indirect branch speculation Add command line control for user space indirect branch speculation mitigations. The new option is: spectre_v2_user= The initial options are: - on: Unconditionally enabled - off: Unconditionally disabled -auto: Kernel selects mitigation (default off for now) When the spectre_v2= command line argument is either 'on' or 'off' this implies that the application to application control follows that state even if a contradicting spectre_v2_user= argument is supplied. Originally-by: Tim Chen Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185005.082720373@linutronix.de --- Documentation/admin-guide/kernel-parameters.txt | 32 +++++- arch/x86/include/asm/nospec-branch.h | 10 ++ arch/x86/kernel/cpu/bugs.c | 133 +++++++++++++++++++++--- 3 files changed, 156 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 19f4423e70d9..b6e5b33b9d75 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4194,9 +4194,13 @@ spectre_v2= [X86] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. + The default operation protects the kernel from + user space attacks. - on - unconditionally enable - off - unconditionally disable + on - unconditionally enable, implies + spectre_v2_user=on + off - unconditionally disable, implies + spectre_v2_user=off auto - kernel detects whether your CPU model is vulnerable @@ -4206,6 +4210,12 @@ CONFIG_RETPOLINE configuration option, and the compiler with which the kernel was built. + Selecting 'on' will also enable the mitigation + against user space to user space task attacks. + + Selecting 'off' will disable both the kernel and + the user space protections. + Specific mitigations can also be selected manually: retpoline - replace indirect branches @@ -4215,6 +4225,24 @@ Not specifying this option is equivalent to spectre_v2=auto. + spectre_v2_user= + [X86] Control mitigation of Spectre variant 2 + (indirect branch speculation) vulnerability between + user space tasks + + on - Unconditionally enable mitigations. Is + enforced by spectre_v2=on + + off - Unconditionally disable mitigations. Is + enforced by spectre_v2=off + + auto - Kernel selects the mitigation depending on + the available CPU features and vulnerability. + Default is off. + + Not specifying this option is equivalent to + spectre_v2_user=auto. + spec_store_bypass_disable= [HW] Control Speculative Store Bypass (SSB) Disable mitigation (Speculative Store Bypass vulnerability) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index c202a64edd95..be0b0aa780e2 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -3,6 +3,8 @@ #ifndef _ASM_X86_NOSPEC_BRANCH_H_ #define _ASM_X86_NOSPEC_BRANCH_H_ +#include + #include #include #include @@ -226,6 +228,12 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS_ENHANCED, }; +/* The indirect branch speculation control variants */ +enum spectre_v2_user_mitigation { + SPECTRE_V2_USER_NONE, + SPECTRE_V2_USER_STRICT, +}; + /* The Speculative Store Bypass disable variants */ enum ssb_mitigation { SPEC_STORE_BYPASS_NONE, @@ -303,6 +311,8 @@ do { \ preempt_enable(); \ } while (0) +DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); + #endif /* __ASSEMBLY__ */ /* diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 4f5a6319dca6..3a223cce1fac 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -54,6 +54,9 @@ static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; u64 __ro_after_init x86_amd_ls_cfg_base; u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask; +/* Control conditional STIPB in switch_to() */ +DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp); + void __init check_bugs(void) { identify_boot_cpu(); @@ -199,6 +202,9 @@ static void x86_amd_ssb_disable(void) static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; +static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init = + SPECTRE_V2_USER_NONE; + #ifdef RETPOLINE static bool spectre_v2_bad_module; @@ -237,6 +243,104 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_RETPOLINE_AMD, }; +enum spectre_v2_user_cmd { + SPECTRE_V2_USER_CMD_NONE, + SPECTRE_V2_USER_CMD_AUTO, + SPECTRE_V2_USER_CMD_FORCE, +}; + +static const char * const spectre_v2_user_strings[] = { + [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", + [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", +}; + +static const struct { + const char *option; + enum spectre_v2_user_cmd cmd; + bool secure; +} v2_user_options[] __initdata = { + { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, + { "off", SPECTRE_V2_USER_CMD_NONE, false }, + { "on", SPECTRE_V2_USER_CMD_FORCE, true }, +}; + +static void __init spec_v2_user_print_cond(const char *reason, bool secure) +{ + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) + pr_info("spectre_v2_user=%s forced on command line.\n", reason); +} + +static enum spectre_v2_user_cmd __init +spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) +{ + char arg[20]; + int ret, i; + + switch (v2_cmd) { + case SPECTRE_V2_CMD_NONE: + return SPECTRE_V2_USER_CMD_NONE; + case SPECTRE_V2_CMD_FORCE: + return SPECTRE_V2_USER_CMD_FORCE; + default: + break; + } + + ret = cmdline_find_option(boot_command_line, "spectre_v2_user", + arg, sizeof(arg)); + if (ret < 0) + return SPECTRE_V2_USER_CMD_AUTO; + + for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { + if (match_option(arg, ret, v2_user_options[i].option)) { + spec_v2_user_print_cond(v2_user_options[i].option, + v2_user_options[i].secure); + return v2_user_options[i].cmd; + } + } + + pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg); + return SPECTRE_V2_USER_CMD_AUTO; +} + +static void __init +spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) +{ + enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; + bool smt_possible = IS_ENABLED(CONFIG_SMP); + + if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) + return; + + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || + cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + smt_possible = false; + + switch (spectre_v2_parse_user_cmdline(v2_cmd)) { + case SPECTRE_V2_USER_CMD_AUTO: + case SPECTRE_V2_USER_CMD_NONE: + goto set_mode; + case SPECTRE_V2_USER_CMD_FORCE: + mode = SPECTRE_V2_USER_STRICT; + break; + } + + /* Initialize Indirect Branch Prediction Barrier */ + if (boot_cpu_has(X86_FEATURE_IBPB)) { + setup_force_cpu_cap(X86_FEATURE_USE_IBPB); + pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n"); + } + + /* If enhanced IBRS is enabled no STIPB required */ + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + return; + +set_mode: + spectre_v2_user = mode; + /* Only print the STIBP mode when SMT possible */ + if (smt_possible) + pr_info("%s\n", spectre_v2_user_strings[mode]); +} + static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", @@ -385,12 +489,6 @@ specv2_set_mode: setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); - /* Initialize Indirect Branch Prediction Barrier if supported */ - if (boot_cpu_has(X86_FEATURE_IBPB)) { - setup_force_cpu_cap(X86_FEATURE_USE_IBPB); - pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n"); - } - /* * Retpoline means the kernel is safe because it has no indirect * branches. Enhanced IBRS protects firmware too, so, enable restricted @@ -407,23 +505,21 @@ specv2_set_mode: pr_info("Enabling Restricted Speculation for firmware calls\n"); } + /* Set up IBPB and STIBP depending on the general spectre V2 command */ + spectre_v2_user_select_mitigation(cmd); + /* Enable STIBP if appropriate */ arch_smt_update(); } static bool stibp_needed(void) { - if (spectre_v2_enabled == SPECTRE_V2_NONE) - return false; - /* Enhanced IBRS makes using STIBP unnecessary. */ if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return false; - if (!boot_cpu_has(X86_FEATURE_STIBP)) - return false; - - return true; + /* Check for strict user mitigation mode */ + return spectre_v2_user == SPECTRE_V2_USER_STRICT; } static void update_stibp_msr(void *info) @@ -844,10 +940,13 @@ static char *stibp_state(void) if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return ""; - if (x86_spec_ctrl_base & SPEC_CTRL_STIBP) - return ", STIBP"; - else - return ""; + switch (spectre_v2_user) { + case SPECTRE_V2_USER_NONE: + return ", STIBP: disabled"; + case SPECTRE_V2_USER_STRICT: + return ", STIBP: forced"; + } + return ""; } static char *ibpb_state(void) -- cgit v1.2.3-55-g7522 From 5bfbe3ad5840d941b89bcac54b821ba14f50a0ba Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Sun, 25 Nov 2018 19:33:46 +0100 Subject: x86/speculation: Prepare for per task indirect branch speculation control To avoid the overhead of STIBP always on, it's necessary to allow per task control of STIBP. Add a new task flag TIF_SPEC_IB and evaluate it during context switch if SMT is active and flag evaluation is enabled by the speculation control code. Add the conditional evaluation to x86_virt_spec_ctrl() as well so the guest/host switch works properly. This has no effect because TIF_SPEC_IB cannot be set yet and the static key which controls evaluation is off. Preparatory patch for adding the control code. [ tglx: Simplify the context switch logic and make the TIF evaluation depend on SMP=y and on the static key controlling the conditional update. Rename it to TIF_SPEC_IB because it controls both STIBP and IBPB ] Signed-off-by: Tim Chen Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185005.176917199@linutronix.de --- arch/x86/include/asm/msr-index.h | 5 +++-- arch/x86/include/asm/spec-ctrl.h | 12 ++++++++++++ arch/x86/include/asm/thread_info.h | 5 ++++- arch/x86/kernel/cpu/bugs.c | 4 ++++ arch/x86/kernel/process.c | 20 ++++++++++++++++++-- 5 files changed, 41 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 80f4a4f38c79..c8f73efb4ece 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -41,9 +41,10 @@ #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ #define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ -#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */ +#define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */ +#define SPEC_CTRL_STIBP (1 << SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ -#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ +#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h index 8e2f8411c7a7..27b0bce3933b 100644 --- a/arch/x86/include/asm/spec-ctrl.h +++ b/arch/x86/include/asm/spec-ctrl.h @@ -53,12 +53,24 @@ static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn) return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); } +static inline u64 stibp_tif_to_spec_ctrl(u64 tifn) +{ + BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT); + return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT); +} + static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl) { BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT); return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); } +static inline unsigned long stibp_spec_ctrl_to_tif(u64 spec_ctrl) +{ + BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT); + return (spec_ctrl & SPEC_CTRL_STIBP) << (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT); +} + static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) { return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 523c69efc38a..fa583ec99e3e 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -83,6 +83,7 @@ struct thread_info { #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ +#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_PATCH_PENDING 13 /* pending live patching update */ @@ -110,6 +111,7 @@ struct thread_info { #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_SPEC_IB (1 << TIF_SPEC_IB) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) @@ -146,7 +148,8 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD) + (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP| \ + _TIF_SSBD|_TIF_SPEC_IB) #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 3a223cce1fac..1e13dbfc0919 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -148,6 +148,10 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) static_cpu_has(X86_FEATURE_AMD_SSBD)) hostval |= ssbd_tif_to_spec_ctrl(ti->flags); + /* Conditional STIBP enabled? */ + if (static_branch_unlikely(&switch_to_cond_stibp)) + hostval |= stibp_tif_to_spec_ctrl(ti->flags); + if (hostval != guestval) { msrval = setguest ? guestval : hostval; wrmsrl(MSR_IA32_SPEC_CTRL, msrval); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 70e9832379e1..574b144d2b53 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -404,11 +404,17 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) static __always_inline void __speculation_ctrl_update(unsigned long tifp, unsigned long tifn) { + unsigned long tif_diff = tifp ^ tifn; u64 msr = x86_spec_ctrl_base; bool updmsr = false; - /* If TIF_SSBD is different, select the proper mitigation method */ - if ((tifp ^ tifn) & _TIF_SSBD) { + /* + * If TIF_SSBD is different, select the proper mitigation + * method. Note that if SSBD mitigation is disabled or permanentely + * enabled this branch can't be taken because nothing can set + * TIF_SSBD. + */ + if (tif_diff & _TIF_SSBD) { if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { amd_set_ssb_virt_state(tifn); } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { @@ -420,6 +426,16 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, } } + /* + * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled, + * otherwise avoid the MSR write. + */ + if (IS_ENABLED(CONFIG_SMP) && + static_branch_unlikely(&switch_to_cond_stibp)) { + updmsr |= !!(tif_diff & _TIF_SPEC_IB); + msr |= stibp_tif_to_spec_ctrl(tifn); + } + if (updmsr) wrmsrl(MSR_IA32_SPEC_CTRL, msr); } -- cgit v1.2.3-55-g7522 From ff16701a29cba3aafa0bd1656d766813b2d0a811 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:47 +0100 Subject: x86/process: Consolidate and simplify switch_to_xtra() code Move the conditional invocation of __switch_to_xtra() into an inline function so the logic can be shared between 32 and 64 bit. Remove the handthrough of the TSS pointer and retrieve the pointer directly in the bitmap handling function. Use this_cpu_ptr() instead of the per_cpu() indirection. This is a preparatory change so integration of conditional indirect branch speculation optimization happens only in one place. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185005.280855518@linutronix.de --- arch/x86/include/asm/switch_to.h | 3 --- arch/x86/kernel/process.c | 12 +++++++----- arch/x86/kernel/process.h | 24 ++++++++++++++++++++++++ arch/x86/kernel/process_32.c | 10 +++------- arch/x86/kernel/process_64.c | 10 +++------- 5 files changed, 37 insertions(+), 22 deletions(-) create mode 100644 arch/x86/kernel/process.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 36bd243843d6..7cf1a270d891 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -11,9 +11,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev, __visible struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); -struct tss_struct; -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss); /* This runs runs on the previous thread's stack. */ static inline void prepare_switch_to(struct task_struct *next) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 574b144d2b53..cdf8e6694f71 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -40,6 +40,8 @@ #include #include +#include "process.h" + /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. The TSS size is kept cacheline-aligned @@ -252,11 +254,12 @@ void arch_setup_new_exec(void) enable_cpuid(); } -static inline void switch_to_bitmap(struct tss_struct *tss, - struct thread_struct *prev, +static inline void switch_to_bitmap(struct thread_struct *prev, struct thread_struct *next, unsigned long tifp, unsigned long tifn) { + struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + if (tifn & _TIF_IO_BITMAP) { /* * Copy the relevant range of the IO bitmap. @@ -448,8 +451,7 @@ void speculation_ctrl_update(unsigned long tif) preempt_enable(); } -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss) +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev, *next; unsigned long tifp, tifn; @@ -459,7 +461,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, tifn = READ_ONCE(task_thread_info(next_p)->flags); tifp = READ_ONCE(task_thread_info(prev_p)->flags); - switch_to_bitmap(tss, prev, next, tifp, tifn); + switch_to_bitmap(prev, next, tifp, tifn); propagate_user_return_notify(prev_p, next_p); diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h new file mode 100644 index 000000000000..020fbfac3a27 --- /dev/null +++ b/arch/x86/kernel/process.h @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Code shared between 32 and 64 bit + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); + +/* + * This needs to be inline to optimize for the common case where no extra + * work needs to be done. + */ +static inline void switch_to_extra(struct task_struct *prev, + struct task_struct *next) +{ + unsigned long next_tif = task_thread_info(next)->flags; + unsigned long prev_tif = task_thread_info(prev)->flags; + + /* + * __switch_to_xtra() handles debug registers, i/o bitmaps, + * speculation mitigations etc. + */ + if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT || + prev_tif & _TIF_WORK_CTXSW_PREV)) + __switch_to_xtra(prev, next); +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 5046a3c9dec2..d3e593eb189f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -59,6 +59,8 @@ #include #include +#include "process.h" + void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -232,7 +234,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ @@ -264,12 +265,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) set_iopl_mask(next->iopl); - /* - * Now maybe handle debug registers and/or IO bitmaps - */ - if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || - task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) - __switch_to_xtra(prev_p, next_p, tss); + switch_to_extra(prev_p, next_p); /* * Leave lazy mode, flushing any hypercalls made here. diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0e0b4288a4b2..bbfbf017065c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -60,6 +60,8 @@ #include #endif +#include "process.h" + /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) { @@ -553,7 +555,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(irq_count) != -1); @@ -617,12 +618,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* Reload sp0. */ update_task_stack(next_p); - /* - * Now maybe reload the debug registers and handle I/O bitmaps - */ - if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || - task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) - __switch_to_xtra(prev_p, next_p, tss); + switch_to_extra(prev_p, next_p); #ifdef CONFIG_XEN_PV /* -- cgit v1.2.3-55-g7522 From 5635d99953f04b550738f6f4c1c532667c3fd872 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:48 +0100 Subject: x86/speculation: Avoid __switch_to_xtra() calls The TIF_SPEC_IB bit does not need to be evaluated in the decision to invoke __switch_to_xtra() when: - CONFIG_SMP is disabled - The conditional STIPB mode is disabled The TIF_SPEC_IB bit still controls IBPB in both cases so the TIF work mask checks might invoke __switch_to_xtra() for nothing if TIF_SPEC_IB is the only set bit in the work masks. Optimize it out by masking the bit at compile time for CONFIG_SMP=n and at run time when the static key controlling the conditional STIBP mode is disabled. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185005.374062201@linutronix.de --- arch/x86/include/asm/thread_info.h | 13 +++++++++++-- arch/x86/kernel/process.h | 15 +++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index fa583ec99e3e..6d201699c651 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -147,9 +147,18 @@ struct thread_info { _TIF_FSCHECK) /* flags to check in __switch_to() */ -#define _TIF_WORK_CTXSW \ +#define _TIF_WORK_CTXSW_BASE \ (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP| \ - _TIF_SSBD|_TIF_SPEC_IB) + _TIF_SSBD) + +/* + * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated. + */ +#ifdef CONFIG_SMP +# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB) +#else +# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE) +#endif #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h index 020fbfac3a27..898e97cf6629 100644 --- a/arch/x86/kernel/process.h +++ b/arch/x86/kernel/process.h @@ -2,6 +2,8 @@ // // Code shared between 32 and 64 bit +#include + void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); /* @@ -14,6 +16,19 @@ static inline void switch_to_extra(struct task_struct *prev, unsigned long next_tif = task_thread_info(next)->flags; unsigned long prev_tif = task_thread_info(prev)->flags; + if (IS_ENABLED(CONFIG_SMP)) { + /* + * Avoid __switch_to_xtra() invocation when conditional + * STIPB is disabled and the only different bit is + * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not + * in the TIF_WORK_CTXSW masks. + */ + if (!static_branch_likely(&switch_to_cond_stibp)) { + prev_tif &= ~_TIF_SPEC_IB; + next_tif &= ~_TIF_SPEC_IB; + } + } + /* * __switch_to_xtra() handles debug registers, i/o bitmaps, * speculation mitigations etc. -- cgit v1.2.3-55-g7522 From 4c71a2b6fd7e42814aa68a6dec88abf3b42ea573 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:49 +0100 Subject: x86/speculation: Prepare for conditional IBPB in switch_mm() The IBPB speculation barrier is issued from switch_mm() when the kernel switches to a user space task with a different mm than the user space task which ran last on the same CPU. An additional optimization is to avoid IBPB when the incoming task can be ptraced by the outgoing task. This optimization only works when switching directly between two user space tasks. When switching from a kernel task to a user space task the optimization fails because the previous task cannot be accessed anymore. So for quite some scenarios the optimization is just adding overhead. The upcoming conditional IBPB support will issue IBPB only for user space tasks which have the TIF_SPEC_IB bit set. This requires to handle the following cases: 1) Switch from a user space task (potential attacker) which has TIF_SPEC_IB set to a user space task (potential victim) which has TIF_SPEC_IB not set. 2) Switch from a user space task (potential attacker) which has TIF_SPEC_IB not set to a user space task (potential victim) which has TIF_SPEC_IB set. This needs to be optimized for the case where the IBPB can be avoided when only kernel threads ran in between user space tasks which belong to the same process. The current check whether two tasks belong to the same context is using the tasks context id. While correct, it's simpler to use the mm pointer because it allows to mangle the TIF_SPEC_IB bit into it. The context id based mechanism requires extra storage, which creates worse code. When a task is scheduled out its TIF_SPEC_IB bit is mangled as bit 0 into the per CPU storage which is used to track the last user space mm which was running on a CPU. This bit can be used together with the TIF_SPEC_IB bit of the incoming task to make the decision whether IBPB needs to be issued or not to cover the two cases above. As conditional IBPB is going to be the default, remove the dubious ptrace check for the IBPB always case and simply issue IBPB always when the process changes. Move the storage to a different place in the struct as the original one created a hole. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185005.466447057@linutronix.de --- arch/x86/include/asm/nospec-branch.h | 2 + arch/x86/include/asm/tlbflush.h | 8 ++- arch/x86/kernel/cpu/bugs.c | 29 +++++++-- arch/x86/mm/tlb.c | 115 ++++++++++++++++++++++++++--------- 4 files changed, 118 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index be0b0aa780e2..d4d35baf0430 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -312,6 +312,8 @@ do { \ } while (0) DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); +DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); +DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d760611cfc35..f4204bf377fc 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -169,10 +169,14 @@ struct tlb_state { #define LOADED_MM_SWITCHING ((struct mm_struct *)1) + /* Last user mm for optimizing IBPB */ + union { + struct mm_struct *last_user_mm; + unsigned long last_user_mm_ibpb; + }; + u16 loaded_mm_asid; u16 next_asid; - /* last user mm's ctx id */ - u64 last_ctx_id; /* * We can be in one of several states: diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1e13dbfc0919..7c946a9af947 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -56,6 +56,10 @@ u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask; /* Control conditional STIPB in switch_to() */ DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp); +/* Control conditional IBPB in switch_mm() */ +DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); +/* Control unconditional IBPB in switch_mm() */ +DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); void __init check_bugs(void) { @@ -331,7 +335,17 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) /* Initialize Indirect Branch Prediction Barrier */ if (boot_cpu_has(X86_FEATURE_IBPB)) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB); - pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n"); + + switch (mode) { + case SPECTRE_V2_USER_STRICT: + static_branch_enable(&switch_mm_always_ibpb); + break; + default: + break; + } + + pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", + mode == SPECTRE_V2_USER_STRICT ? "always-on" : "conditional"); } /* If enhanced IBRS is enabled no STIPB required */ @@ -955,10 +969,15 @@ static char *stibp_state(void) static char *ibpb_state(void) { - if (boot_cpu_has(X86_FEATURE_USE_IBPB)) - return ", IBPB"; - else - return ""; + if (boot_cpu_has(X86_FEATURE_IBPB)) { + switch (spectre_v2_user) { + case SPECTRE_V2_USER_NONE: + return ", IBPB: disabled"; + case SPECTRE_V2_USER_STRICT: + return ", IBPB: always-on"; + } + } + return ""; } static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index bddd6b3cee1d..03b6b4c2238d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -30,6 +29,12 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +/* + * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is + * stored in cpu_tlb_state.last_user_mm_ibpb. + */ +#define LAST_USER_MM_IBPB 0x1UL + /* * We get here when we do something requiring a TLB invalidation * but could not go invalidate all of the contexts. We do the @@ -181,17 +186,87 @@ static void sync_current_stack_to_mm(struct mm_struct *mm) } } -static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id) +static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) +{ + unsigned long next_tif = task_thread_info(next)->flags; + unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB; + + return (unsigned long)next->mm | ibpb; +} + +static void cond_ibpb(struct task_struct *next) { + if (!next || !next->mm) + return; + /* - * Check if the current (previous) task has access to the memory - * of the @tsk (next) task. If access is denied, make sure to - * issue a IBPB to stop user->user Spectre-v2 attacks. - * - * Note: __ptrace_may_access() returns 0 or -ERRNO. + * Both, the conditional and the always IBPB mode use the mm + * pointer to avoid the IBPB when switching between tasks of the + * same process. Using the mm pointer instead of mm->context.ctx_id + * opens a hypothetical hole vs. mm_struct reuse, which is more or + * less impossible to control by an attacker. Aside of that it + * would only affect the first schedule so the theoretically + * exposed data is not really interesting. */ - return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id && - ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB)); + if (static_branch_likely(&switch_mm_cond_ibpb)) { + unsigned long prev_mm, next_mm; + + /* + * This is a bit more complex than the always mode because + * it has to handle two cases: + * + * 1) Switch from a user space task (potential attacker) + * which has TIF_SPEC_IB set to a user space task + * (potential victim) which has TIF_SPEC_IB not set. + * + * 2) Switch from a user space task (potential attacker) + * which has TIF_SPEC_IB not set to a user space task + * (potential victim) which has TIF_SPEC_IB set. + * + * This could be done by unconditionally issuing IBPB when + * a task which has TIF_SPEC_IB set is either scheduled in + * or out. Though that results in two flushes when: + * + * - the same user space task is scheduled out and later + * scheduled in again and only a kernel thread ran in + * between. + * + * - a user space task belonging to the same process is + * scheduled in after a kernel thread ran in between + * + * - a user space task belonging to the same process is + * scheduled in immediately. + * + * Optimize this with reasonably small overhead for the + * above cases. Mangle the TIF_SPEC_IB bit into the mm + * pointer of the incoming task which is stored in + * cpu_tlbstate.last_user_mm_ibpb for comparison. + */ + next_mm = mm_mangle_tif_spec_ib(next); + prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb); + + /* + * Issue IBPB only if the mm's are different and one or + * both have the IBPB bit set. + */ + if (next_mm != prev_mm && + (next_mm | prev_mm) & LAST_USER_MM_IBPB) + indirect_branch_prediction_barrier(); + + this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm); + } + + if (static_branch_unlikely(&switch_mm_always_ibpb)) { + /* + * Only flush when switching to a user space task with a + * different context than the user space task which ran + * last on this CPU. + */ + if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) { + indirect_branch_prediction_barrier(); + this_cpu_write(cpu_tlbstate.last_user_mm, next->mm); + } + } } void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, @@ -292,22 +367,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, new_asid = prev_asid; need_flush = true; } else { - u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); - /* * Avoid user/user BTB poisoning by flushing the branch * predictor when switching between processes. This stops * one process from doing Spectre-v2 attacks on another. - * - * As an optimization, flush indirect branches only when - * switching into a processes that can't be ptrace by the - * current one (as in such case, attacker has much more - * convenient way how to tamper with the next process than - * branch buffer poisoning). */ - if (static_cpu_has(X86_FEATURE_USE_IBPB) && - ibpb_needed(tsk, last_ctx_id)) - indirect_branch_prediction_barrier(); + cond_ibpb(tsk); if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* @@ -365,14 +430,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); } - /* - * Record last user mm's context id, so we can avoid - * flushing branch buffer with IBPB if we switch back - * to the same user. - */ - if (next != &init_mm) - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); - /* Make sure we write CR3 before loaded_mm. */ barrier(); @@ -441,7 +498,7 @@ void initialize_tlbstate_and_flush(void) write_cr3(build_cr3(mm->pgd, 0)); /* Reinitialize tlbstate. */ - this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id); + this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB); this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); -- cgit v1.2.3-55-g7522 From e6da8bb6f9abb2628381904b24163c770e630bac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:51 +0100 Subject: x86/speculation: Split out TIF update The update of the TIF_SSBD flag and the conditional speculation control MSR update is done in the ssb_prctl_set() function directly. The upcoming prctl support for controlling indirect branch speculation via STIBP needs the same mechanism. Split the code out and make it reusable. Reword the comment about updates for other tasks. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185005.652305076@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7c946a9af947..3b65a53d2c33 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -702,10 +702,29 @@ static void ssb_select_mitigation(void) #undef pr_fmt #define pr_fmt(fmt) "Speculation prctl: " fmt -static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) +static void task_update_spec_tif(struct task_struct *tsk, int tifbit, bool on) { bool update; + if (on) + update = !test_and_set_tsk_thread_flag(tsk, tifbit); + else + update = test_and_clear_tsk_thread_flag(tsk, tifbit); + + /* + * Immediately update the speculation control MSRs for the current + * task, but for a non-current task delay setting the CPU + * mitigation until it is scheduled next. + * + * This can only happen for SECCOMP mitigation. For PRCTL it's + * always the current task. + */ + if (tsk == current && update) + speculation_ctrl_update_current(); +} + +static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) +{ if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && ssb_mode != SPEC_STORE_BYPASS_SECCOMP) return -ENXIO; @@ -716,28 +735,20 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) if (task_spec_ssb_force_disable(task)) return -EPERM; task_clear_spec_ssb_disable(task); - update = test_and_clear_tsk_thread_flag(task, TIF_SSBD); + task_update_spec_tif(task, TIF_SSBD, false); break; case PR_SPEC_DISABLE: task_set_spec_ssb_disable(task); - update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); + task_update_spec_tif(task, TIF_SSBD, true); break; case PR_SPEC_FORCE_DISABLE: task_set_spec_ssb_disable(task); task_set_spec_ssb_force_disable(task); - update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); + task_update_spec_tif(task, TIF_SSBD, true); break; default: return -ERANGE; } - - /* - * If being set on non-current task, delay setting the CPU - * mitigation until it is next scheduled. - */ - if (task == current && update) - speculation_ctrl_update_current(); - return 0; } -- cgit v1.2.3-55-g7522 From 6d991ba509ebcfcc908e009d1db51972a4f7a064 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 28 Nov 2018 10:56:57 +0100 Subject: x86/speculation: Prevent stale SPEC_CTRL msr content The seccomp speculation control operates on all tasks of a process, but only the current task of a process can update the MSR immediately. For the other threads the update is deferred to the next context switch. This creates the following situation with Process A and B: Process A task 2 and Process B task 1 are pinned on CPU1. Process A task 2 does not have the speculation control TIF bit set. Process B task 1 has the speculation control TIF bit set. CPU0 CPU1 MSR bit is set ProcB.T1 schedules out ProcA.T2 schedules in MSR bit is cleared ProcA.T1 seccomp_update() set TIF bit on ProcA.T2 ProcB.T1 schedules in MSR is not updated <-- FAIL This happens because the context switch code tries to avoid the MSR update if the speculation control TIF bits of the incoming and the outgoing task are the same. In the worst case ProcB.T1 and ProcA.T2 are the only tasks scheduling back and forth on CPU1, which keeps the MSR stale forever. In theory this could be remedied by IPIs, but chasing the remote task which could be migrated is complex and full of races. The straight forward solution is to avoid the asychronous update of the TIF bit and defer it to the next context switch. The speculation control state is stored in task_struct::atomic_flags by the prctl and seccomp updates already. Add a new TIF_SPEC_FORCE_UPDATE bit and set this after updating the atomic_flags. Check the bit on context switch and force a synchronous update of the speculation control if set. Use the same mechanism for updating the current task. Reported-by: Tim Chen Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1811272247140.1875@nanos.tec.linutronix.de --- arch/x86/include/asm/spec-ctrl.h | 6 +----- arch/x86/include/asm/thread_info.h | 4 +++- arch/x86/kernel/cpu/bugs.c | 18 +++++++----------- arch/x86/kernel/process.c | 30 +++++++++++++++++++++++++++++- 4 files changed, 40 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h index 27b0bce3933b..5393babc0598 100644 --- a/arch/x86/include/asm/spec-ctrl.h +++ b/arch/x86/include/asm/spec-ctrl.h @@ -83,10 +83,6 @@ static inline void speculative_store_bypass_ht_init(void) { } #endif extern void speculation_ctrl_update(unsigned long tif); - -static inline void speculation_ctrl_update_current(void) -{ - speculation_ctrl_update(current_thread_info()->flags); -} +extern void speculation_ctrl_update_current(void); #endif diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 6d201699c651..82b73b75d67c 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -84,6 +84,7 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ +#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_PATCH_PENDING 13 /* pending live patching update */ @@ -112,6 +113,7 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) +#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) @@ -149,7 +151,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP| \ - _TIF_SSBD) + _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE) /* * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated. diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 3b65a53d2c33..29f40a92f5a8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -702,14 +702,10 @@ static void ssb_select_mitigation(void) #undef pr_fmt #define pr_fmt(fmt) "Speculation prctl: " fmt -static void task_update_spec_tif(struct task_struct *tsk, int tifbit, bool on) +static void task_update_spec_tif(struct task_struct *tsk) { - bool update; - - if (on) - update = !test_and_set_tsk_thread_flag(tsk, tifbit); - else - update = test_and_clear_tsk_thread_flag(tsk, tifbit); + /* Force the update of the real TIF bits */ + set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE); /* * Immediately update the speculation control MSRs for the current @@ -719,7 +715,7 @@ static void task_update_spec_tif(struct task_struct *tsk, int tifbit, bool on) * This can only happen for SECCOMP mitigation. For PRCTL it's * always the current task. */ - if (tsk == current && update) + if (tsk == current) speculation_ctrl_update_current(); } @@ -735,16 +731,16 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) if (task_spec_ssb_force_disable(task)) return -EPERM; task_clear_spec_ssb_disable(task); - task_update_spec_tif(task, TIF_SSBD, false); + task_update_spec_tif(task); break; case PR_SPEC_DISABLE: task_set_spec_ssb_disable(task); - task_update_spec_tif(task, TIF_SSBD, true); + task_update_spec_tif(task); break; case PR_SPEC_FORCE_DISABLE: task_set_spec_ssb_disable(task); task_set_spec_ssb_force_disable(task); - task_update_spec_tif(task, TIF_SSBD, true); + task_update_spec_tif(task); break; default: return -ERANGE; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index cdf8e6694f71..afbe2eb4a1c6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -443,6 +443,18 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, wrmsrl(MSR_IA32_SPEC_CTRL, msr); } +static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) +{ + if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) { + if (task_spec_ssb_disable(tsk)) + set_tsk_thread_flag(tsk, TIF_SSBD); + else + clear_tsk_thread_flag(tsk, TIF_SSBD); + } + /* Return the updated threadinfo flags*/ + return task_thread_info(tsk)->flags; +} + void speculation_ctrl_update(unsigned long tif) { /* Forced update. Make sure all relevant TIF flags are different */ @@ -451,6 +463,14 @@ void speculation_ctrl_update(unsigned long tif) preempt_enable(); } +/* Called from seccomp/prctl update */ +void speculation_ctrl_update_current(void) +{ + preempt_disable(); + speculation_ctrl_update(speculation_ctrl_update_tif(current)); + preempt_enable(); +} + void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev, *next; @@ -482,7 +502,15 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) if ((tifp ^ tifn) & _TIF_NOCPUID) set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); - __speculation_ctrl_update(tifp, tifn); + if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) { + __speculation_ctrl_update(tifp, tifn); + } else { + speculation_ctrl_update_tif(prev_p); + tifn = speculation_ctrl_update_tif(next_p); + + /* Enforce MSR update to ensure consistent state */ + __speculation_ctrl_update(~tifn, tifn); + } } /* -- cgit v1.2.3-55-g7522 From 6893a959d7fdebbab5f5aa112c277d5a44435ba1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:52 +0100 Subject: x86/speculation: Prepare arch_smt_update() for PRCTL mode The upcoming fine grained per task STIBP control needs to be updated on CPU hotplug as well. Split out the code which controls the strict mode so the prctl control code can be added later. Mark the SMP function call argument __unused while at it. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185005.759457117@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 29f40a92f5a8..9cab538e10f1 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -530,40 +530,44 @@ specv2_set_mode: arch_smt_update(); } -static bool stibp_needed(void) +static void update_stibp_msr(void * __unused) { - /* Enhanced IBRS makes using STIBP unnecessary. */ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) - return false; - - /* Check for strict user mitigation mode */ - return spectre_v2_user == SPECTRE_V2_USER_STRICT; + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); } -static void update_stibp_msr(void *info) +/* Update x86_spec_ctrl_base in case SMT state changed. */ +static void update_stibp_strict(void) { - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; + + if (sched_smt_active()) + mask |= SPEC_CTRL_STIBP; + + if (mask == x86_spec_ctrl_base) + return; + + pr_info("Update user space SMT mitigation: STIBP %s\n", + mask & SPEC_CTRL_STIBP ? "always-on" : "off"); + x86_spec_ctrl_base = mask; + on_each_cpu(update_stibp_msr, NULL, 1); } void arch_smt_update(void) { - u64 mask; - - if (!stibp_needed()) + /* Enhanced IBRS implies STIBP. No update required. */ + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return; mutex_lock(&spec_ctrl_mutex); - mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; - if (sched_smt_active()) - mask |= SPEC_CTRL_STIBP; - - if (mask != x86_spec_ctrl_base) { - pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n", - mask & SPEC_CTRL_STIBP ? "Enabling" : "Disabling"); - x86_spec_ctrl_base = mask; - on_each_cpu(update_stibp_msr, NULL, 1); + switch (spectre_v2_user) { + case SPECTRE_V2_USER_NONE: + break; + case SPECTRE_V2_USER_STRICT: + update_stibp_strict(); + break; } + mutex_unlock(&spec_ctrl_mutex); } -- cgit v1.2.3-55-g7522 From 9137bb27e60e554dab694eafa4cca241fa3a694f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:53 +0100 Subject: x86/speculation: Add prctl() control for indirect branch speculation Add the PR_SPEC_INDIRECT_BRANCH option for the PR_GET_SPECULATION_CTRL and PR_SET_SPECULATION_CTRL prctls to allow fine grained per task control of indirect branch speculation via STIBP and IBPB. Invocations: Check indirect branch speculation status with - prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0); Enable indirect branch speculation with - prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0); Disable indirect branch speculation with - prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0); Force disable indirect branch speculation with - prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0); See Documentation/userspace-api/spec_ctrl.rst. Signed-off-by: Tim Chen Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185005.866780996@linutronix.de --- Documentation/userspace-api/spec_ctrl.rst | 9 +++++ arch/x86/include/asm/nospec-branch.h | 1 + arch/x86/kernel/cpu/bugs.c | 67 +++++++++++++++++++++++++++++++ arch/x86/kernel/process.c | 5 +++ include/linux/sched.h | 9 +++++ include/uapi/linux/prctl.h | 1 + tools/include/uapi/linux/prctl.h | 1 + 7 files changed, 93 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst index 32f3d55c54b7..c4dbe6f7cdae 100644 --- a/Documentation/userspace-api/spec_ctrl.rst +++ b/Documentation/userspace-api/spec_ctrl.rst @@ -92,3 +92,12 @@ Speculation misfeature controls * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0); + +- PR_SPEC_INDIR_BRANCH: Indirect Branch Speculation in User Processes + (Mitigate Spectre V2 style attacks against user processes) + + Invocations: + * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0); diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index d4d35baf0430..2adbe7b047fa 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -232,6 +232,7 @@ enum spectre_v2_mitigation { enum spectre_v2_user_mitigation { SPECTRE_V2_USER_NONE, SPECTRE_V2_USER_STRICT, + SPECTRE_V2_USER_PRCTL, }; /* The Speculative Store Bypass disable variants */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9cab538e10f1..74359fff87fd 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -566,6 +566,8 @@ void arch_smt_update(void) case SPECTRE_V2_USER_STRICT: update_stibp_strict(); break; + case SPECTRE_V2_USER_PRCTL: + break; } mutex_unlock(&spec_ctrl_mutex); @@ -752,12 +754,50 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) return 0; } +static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + switch (ctrl) { + case PR_SPEC_ENABLE: + if (spectre_v2_user == SPECTRE_V2_USER_NONE) + return 0; + /* + * Indirect branch speculation is always disabled in strict + * mode. + */ + if (spectre_v2_user == SPECTRE_V2_USER_STRICT) + return -EPERM; + task_clear_spec_ib_disable(task); + task_update_spec_tif(task); + break; + case PR_SPEC_DISABLE: + case PR_SPEC_FORCE_DISABLE: + /* + * Indirect branch speculation is always allowed when + * mitigation is force disabled. + */ + if (spectre_v2_user == SPECTRE_V2_USER_NONE) + return -EPERM; + if (spectre_v2_user == SPECTRE_V2_USER_STRICT) + return 0; + task_set_spec_ib_disable(task); + if (ctrl == PR_SPEC_FORCE_DISABLE) + task_set_spec_ib_force_disable(task); + task_update_spec_tif(task); + break; + default: + return -ERANGE; + } + return 0; +} + int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, unsigned long ctrl) { switch (which) { case PR_SPEC_STORE_BYPASS: return ssb_prctl_set(task, ctrl); + case PR_SPEC_INDIRECT_BRANCH: + return ib_prctl_set(task, ctrl); default: return -ENODEV; } @@ -790,11 +830,34 @@ static int ssb_prctl_get(struct task_struct *task) } } +static int ib_prctl_get(struct task_struct *task) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return PR_SPEC_NOT_AFFECTED; + + switch (spectre_v2_user) { + case SPECTRE_V2_USER_NONE: + return PR_SPEC_ENABLE; + case SPECTRE_V2_USER_PRCTL: + if (task_spec_ib_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ib_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + case SPECTRE_V2_USER_STRICT: + return PR_SPEC_DISABLE; + default: + return PR_SPEC_NOT_AFFECTED; + } +} + int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) { switch (which) { case PR_SPEC_STORE_BYPASS: return ssb_prctl_get(task); + case PR_SPEC_INDIRECT_BRANCH: + return ib_prctl_get(task); default: return -ENODEV; } @@ -974,6 +1037,8 @@ static char *stibp_state(void) return ", STIBP: disabled"; case SPECTRE_V2_USER_STRICT: return ", STIBP: forced"; + case SPECTRE_V2_USER_PRCTL: + return ""; } return ""; } @@ -986,6 +1051,8 @@ static char *ibpb_state(void) return ", IBPB: disabled"; case SPECTRE_V2_USER_STRICT: return ", IBPB: always-on"; + case SPECTRE_V2_USER_PRCTL: + return ""; } } return ""; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index afbe2eb4a1c6..7d31192296a8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -450,6 +450,11 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) set_tsk_thread_flag(tsk, TIF_SSBD); else clear_tsk_thread_flag(tsk, TIF_SSBD); + + if (task_spec_ib_disable(tsk)) + set_tsk_thread_flag(tsk, TIF_SPEC_IB); + else + clear_tsk_thread_flag(tsk, TIF_SPEC_IB); } /* Return the updated threadinfo flags*/ return task_thread_info(tsk)->flags; diff --git a/include/linux/sched.h b/include/linux/sched.h index a51c13c2b1a0..d607db5fcc6a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1453,6 +1453,8 @@ static inline bool is_percpu_thread(void) #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ +#define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ +#define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ @@ -1484,6 +1486,13 @@ TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) +TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) +TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) +TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) + +TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) +TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) + static inline void current_restore_flags(unsigned long orig_flags, unsigned long flags) { diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index c0d7ea0bf5b6..b17201edfa09 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -212,6 +212,7 @@ struct prctl_mm_map { #define PR_SET_SPECULATION_CTRL 53 /* Speculation control variants */ # define PR_SPEC_STORE_BYPASS 0 +# define PR_SPEC_INDIRECT_BRANCH 1 /* Return and control values for PR_SET/GET_SPECULATION_CTRL */ # define PR_SPEC_NOT_AFFECTED 0 # define PR_SPEC_PRCTL (1UL << 0) diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index c0d7ea0bf5b6..b17201edfa09 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -212,6 +212,7 @@ struct prctl_mm_map { #define PR_SET_SPECULATION_CTRL 53 /* Speculation control variants */ # define PR_SPEC_STORE_BYPASS 0 +# define PR_SPEC_INDIRECT_BRANCH 1 /* Return and control values for PR_SET/GET_SPECULATION_CTRL */ # define PR_SPEC_NOT_AFFECTED 0 # define PR_SPEC_PRCTL (1UL << 0) -- cgit v1.2.3-55-g7522 From 7cc765a67d8e04ef7d772425ca5a2a1e2b894c15 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:54 +0100 Subject: x86/speculation: Enable prctl mode for spectre_v2_user Now that all prerequisites are in place: - Add the prctl command line option - Default the 'auto' mode to 'prctl' - When SMT state changes, update the static key which controls the conditional STIBP evaluation on context switch. - At init update the static key which controls the conditional IBPB evaluation on context switch. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185005.958421388@linutronix.de --- Documentation/admin-guide/kernel-parameters.txt | 7 ++++- arch/x86/kernel/cpu/bugs.c | 41 +++++++++++++++++++------ 2 files changed, 38 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b6e5b33b9d75..a9b98a4e8789 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4236,9 +4236,14 @@ off - Unconditionally disable mitigations. Is enforced by spectre_v2=off + prctl - Indirect branch speculation is enabled, + but mitigation can be enabled via prctl + per thread. The mitigation control state + is inherited on fork. + auto - Kernel selects the mitigation depending on the available CPU features and vulnerability. - Default is off. + Default is prctl. Not specifying this option is equivalent to spectre_v2_user=auto. diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 74359fff87fd..d0137d10f9a6 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -255,11 +255,13 @@ enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_NONE, SPECTRE_V2_USER_CMD_AUTO, SPECTRE_V2_USER_CMD_FORCE, + SPECTRE_V2_USER_CMD_PRCTL, }; static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", + [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl", }; static const struct { @@ -270,6 +272,7 @@ static const struct { { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, { "off", SPECTRE_V2_USER_CMD_NONE, false }, { "on", SPECTRE_V2_USER_CMD_FORCE, true }, + { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, }; static void __init spec_v2_user_print_cond(const char *reason, bool secure) @@ -324,12 +327,15 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) smt_possible = false; switch (spectre_v2_parse_user_cmdline(v2_cmd)) { - case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_NONE: goto set_mode; case SPECTRE_V2_USER_CMD_FORCE: mode = SPECTRE_V2_USER_STRICT; break; + case SPECTRE_V2_USER_CMD_AUTO: + case SPECTRE_V2_USER_CMD_PRCTL: + mode = SPECTRE_V2_USER_PRCTL; + break; } /* Initialize Indirect Branch Prediction Barrier */ @@ -340,6 +346,9 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) case SPECTRE_V2_USER_STRICT: static_branch_enable(&switch_mm_always_ibpb); break; + case SPECTRE_V2_USER_PRCTL: + static_branch_enable(&switch_mm_cond_ibpb); + break; default: break; } @@ -352,6 +361,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return; + /* + * If SMT is not possible or STIBP is not available clear the STIPB + * mode. + */ + if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP)) + mode = SPECTRE_V2_USER_NONE; set_mode: spectre_v2_user = mode; /* Only print the STIBP mode when SMT possible */ @@ -552,6 +567,15 @@ static void update_stibp_strict(void) on_each_cpu(update_stibp_msr, NULL, 1); } +/* Update the static key controlling the evaluation of TIF_SPEC_IB */ +static void update_indir_branch_cond(void) +{ + if (sched_smt_active()) + static_branch_enable(&switch_to_cond_stibp); + else + static_branch_disable(&switch_to_cond_stibp); +} + void arch_smt_update(void) { /* Enhanced IBRS implies STIBP. No update required. */ @@ -567,6 +591,7 @@ void arch_smt_update(void) update_stibp_strict(); break; case SPECTRE_V2_USER_PRCTL: + update_indir_branch_cond(); break; } @@ -1038,7 +1063,8 @@ static char *stibp_state(void) case SPECTRE_V2_USER_STRICT: return ", STIBP: forced"; case SPECTRE_V2_USER_PRCTL: - return ""; + if (static_key_enabled(&switch_to_cond_stibp)) + return ", STIBP: conditional"; } return ""; } @@ -1046,14 +1072,11 @@ static char *stibp_state(void) static char *ibpb_state(void) { if (boot_cpu_has(X86_FEATURE_IBPB)) { - switch (spectre_v2_user) { - case SPECTRE_V2_USER_NONE: - return ", IBPB: disabled"; - case SPECTRE_V2_USER_STRICT: + if (static_key_enabled(&switch_mm_always_ibpb)) return ", IBPB: always-on"; - case SPECTRE_V2_USER_PRCTL: - return ""; - } + if (static_key_enabled(&switch_mm_cond_ibpb)) + return ", IBPB: conditional"; + return ", IBPB: disabled"; } return ""; } -- cgit v1.2.3-55-g7522 From 6b3e64c237c072797a9ec918654a60e3a46488e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:55 +0100 Subject: x86/speculation: Add seccomp Spectre v2 user space protection mode If 'prctl' mode of user space protection from spectre v2 is selected on the kernel command-line, STIBP and IBPB are applied on tasks which restrict their indirect branch speculation via prctl. SECCOMP enables the SSBD mitigation for sandboxed tasks already, so it makes sense to prevent spectre v2 user space to user space attacks as well. The Intel mitigation guide documents how STIPB works: Setting bit 1 (STIBP) of the IA32_SPEC_CTRL MSR on a logical processor prevents the predicted targets of indirect branches on any logical processor of that core from being controlled by software that executes (or executed previously) on another logical processor of the same core. Ergo setting STIBP protects the task itself from being attacked from a task running on a different hyper-thread and protects the tasks running on different hyper-threads from being attacked. While the document suggests that the branch predictors are shielded between the logical processors, the observed performance regressions suggest that STIBP simply disables the branch predictor more or less completely. Of course the document wording is vague, but the fact that there is also no requirement for issuing IBPB when STIBP is used points clearly in that direction. The kernel still issues IBPB even when STIBP is used until Intel clarifies the whole mechanism. IBPB is issued when the task switches out, so malicious sandbox code cannot mistrain the branch predictor for the next user space task on the same logical processor. Signed-off-by: Jiri Kosina Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185006.051663132@linutronix.de --- Documentation/admin-guide/kernel-parameters.txt | 9 ++++++++- arch/x86/include/asm/nospec-branch.h | 1 + arch/x86/kernel/cpu/bugs.c | 17 ++++++++++++++++- 3 files changed, 25 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a9b98a4e8789..f405281bb202 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4241,9 +4241,16 @@ per thread. The mitigation control state is inherited on fork. + seccomp + - Same as "prctl" above, but all seccomp + threads will enable the mitigation unless + they explicitly opt out. + auto - Kernel selects the mitigation depending on the available CPU features and vulnerability. - Default is prctl. + + Default mitigation: + If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl" Not specifying this option is equivalent to spectre_v2_user=auto. diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 2adbe7b047fa..032b6009baab 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -233,6 +233,7 @@ enum spectre_v2_user_mitigation { SPECTRE_V2_USER_NONE, SPECTRE_V2_USER_STRICT, SPECTRE_V2_USER_PRCTL, + SPECTRE_V2_USER_SECCOMP, }; /* The Speculative Store Bypass disable variants */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d0137d10f9a6..c9e304960534 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -256,12 +256,14 @@ enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_AUTO, SPECTRE_V2_USER_CMD_FORCE, SPECTRE_V2_USER_CMD_PRCTL, + SPECTRE_V2_USER_CMD_SECCOMP, }; static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl", + [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", }; static const struct { @@ -273,6 +275,7 @@ static const struct { { "off", SPECTRE_V2_USER_CMD_NONE, false }, { "on", SPECTRE_V2_USER_CMD_FORCE, true }, { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, + { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, }; static void __init spec_v2_user_print_cond(const char *reason, bool secure) @@ -332,10 +335,16 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) case SPECTRE_V2_USER_CMD_FORCE: mode = SPECTRE_V2_USER_STRICT; break; - case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_PRCTL: mode = SPECTRE_V2_USER_PRCTL; break; + case SPECTRE_V2_USER_CMD_AUTO: + case SPECTRE_V2_USER_CMD_SECCOMP: + if (IS_ENABLED(CONFIG_SECCOMP)) + mode = SPECTRE_V2_USER_SECCOMP; + else + mode = SPECTRE_V2_USER_PRCTL; + break; } /* Initialize Indirect Branch Prediction Barrier */ @@ -347,6 +356,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) static_branch_enable(&switch_mm_always_ibpb); break; case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: static_branch_enable(&switch_mm_cond_ibpb); break; default: @@ -591,6 +601,7 @@ void arch_smt_update(void) update_stibp_strict(); break; case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: update_indir_branch_cond(); break; } @@ -833,6 +844,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task) { if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); + if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP) + ib_prctl_set(task, PR_SPEC_FORCE_DISABLE); } #endif @@ -864,6 +877,7 @@ static int ib_prctl_get(struct task_struct *task) case SPECTRE_V2_USER_NONE: return PR_SPEC_ENABLE; case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: if (task_spec_ib_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ib_disable(task)) @@ -1063,6 +1077,7 @@ static char *stibp_state(void) case SPECTRE_V2_USER_STRICT: return ", STIBP: forced"; case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: if (static_key_enabled(&switch_to_cond_stibp)) return ", STIBP: conditional"; } -- cgit v1.2.3-55-g7522 From 55a974021ec952ee460dc31ca08722158639de72 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:56 +0100 Subject: x86/speculation: Provide IBPB always command line options Provide the possibility to enable IBPB always in combination with 'prctl' and 'seccomp'. Add the extra command line options and rework the IBPB selection to evaluate the command instead of the mode selected by the STIPB switch case. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185006.144047038@linutronix.de --- Documentation/admin-guide/kernel-parameters.txt | 12 +++++++++ arch/x86/kernel/cpu/bugs.c | 34 +++++++++++++++++-------- 2 files changed, 35 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f405281bb202..05a252e5178d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4241,11 +4241,23 @@ per thread. The mitigation control state is inherited on fork. + prctl,ibpb + - Like "prctl" above, but only STIBP is + controlled per thread. IBPB is issued + always when switching between different user + space processes. + seccomp - Same as "prctl" above, but all seccomp threads will enable the mitigation unless they explicitly opt out. + seccomp,ibpb + - Like "seccomp" above, but only STIBP is + controlled per thread. IBPB is issued + always when switching between different + user space processes. + auto - Kernel selects the mitigation depending on the available CPU features and vulnerability. diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c9e304960534..500278f5308e 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -256,7 +256,9 @@ enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_AUTO, SPECTRE_V2_USER_CMD_FORCE, SPECTRE_V2_USER_CMD_PRCTL, + SPECTRE_V2_USER_CMD_PRCTL_IBPB, SPECTRE_V2_USER_CMD_SECCOMP, + SPECTRE_V2_USER_CMD_SECCOMP_IBPB, }; static const char * const spectre_v2_user_strings[] = { @@ -271,11 +273,13 @@ static const struct { enum spectre_v2_user_cmd cmd; bool secure; } v2_user_options[] __initdata = { - { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, - { "off", SPECTRE_V2_USER_CMD_NONE, false }, - { "on", SPECTRE_V2_USER_CMD_FORCE, true }, - { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, - { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, + { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, + { "off", SPECTRE_V2_USER_CMD_NONE, false }, + { "on", SPECTRE_V2_USER_CMD_FORCE, true }, + { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, + { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false }, + { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, + { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false }, }; static void __init spec_v2_user_print_cond(const char *reason, bool secure) @@ -321,6 +325,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) { enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; bool smt_possible = IS_ENABLED(CONFIG_SMP); + enum spectre_v2_user_cmd cmd; if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; @@ -329,17 +334,20 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) cpu_smt_control == CPU_SMT_NOT_SUPPORTED) smt_possible = false; - switch (spectre_v2_parse_user_cmdline(v2_cmd)) { + cmd = spectre_v2_parse_user_cmdline(v2_cmd); + switch (cmd) { case SPECTRE_V2_USER_CMD_NONE: goto set_mode; case SPECTRE_V2_USER_CMD_FORCE: mode = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_PRCTL: + case SPECTRE_V2_USER_CMD_PRCTL_IBPB: mode = SPECTRE_V2_USER_PRCTL; break; case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_SECCOMP: + case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: if (IS_ENABLED(CONFIG_SECCOMP)) mode = SPECTRE_V2_USER_SECCOMP; else @@ -351,12 +359,15 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) if (boot_cpu_has(X86_FEATURE_IBPB)) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB); - switch (mode) { - case SPECTRE_V2_USER_STRICT: + switch (cmd) { + case SPECTRE_V2_USER_CMD_FORCE: + case SPECTRE_V2_USER_CMD_PRCTL_IBPB: + case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: static_branch_enable(&switch_mm_always_ibpb); break; - case SPECTRE_V2_USER_PRCTL: - case SPECTRE_V2_USER_SECCOMP: + case SPECTRE_V2_USER_CMD_PRCTL: + case SPECTRE_V2_USER_CMD_AUTO: + case SPECTRE_V2_USER_CMD_SECCOMP: static_branch_enable(&switch_mm_cond_ibpb); break; default: @@ -364,7 +375,8 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) } pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", - mode == SPECTRE_V2_USER_STRICT ? "always-on" : "conditional"); + static_key_enabled(&switch_mm_always_ibpb) ? + "always-on" : "conditional"); } /* If enhanced IBRS is enabled no STIPB required */ -- cgit v1.2.3-55-g7522 From a7b403104e17209ea71eea59d4a71bf9e0d8cb83 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 23 Nov 2018 17:24:51 +0100 Subject: xen/x86: add diagnostic printout to xen_mc_flush() in case of error Failure of an element of a Xen multicall is signalled via a WARN() only if the kernel is compiled with MC_DEBUG. It is impossible to know which element failed and why it did so. Change that by printing the related information even without MC_DEBUG, even if maybe in some limited form (e.g. without information which caller produced the failing element). Move the printing out of the switch statement in order to have the same information for a single call. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross --- arch/x86/xen/multicalls.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 2bce7958ce8b..0766a08bdf45 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -69,6 +69,11 @@ void xen_mc_flush(void) trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx); +#if MC_DEBUG + memcpy(b->debug, b->entries, + b->mcidx * sizeof(struct multicall_entry)); +#endif + switch (b->mcidx) { case 0: /* no-op */ @@ -87,32 +92,34 @@ void xen_mc_flush(void) break; default: -#if MC_DEBUG - memcpy(b->debug, b->entries, - b->mcidx * sizeof(struct multicall_entry)); -#endif - if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) BUG(); for (i = 0; i < b->mcidx; i++) if (b->entries[i].result < 0) ret++; + } + if (WARN_ON(ret)) { + pr_err("%d of %d multicall(s) failed: cpu %d\n", + ret, b->mcidx, smp_processor_id()); + for (i = 0; i < b->mcidx; i++) { + if (b->entries[i].result < 0) { #if MC_DEBUG - if (ret) { - printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", - ret, smp_processor_id()); - dump_stack(); - for (i = 0; i < b->mcidx; i++) { - printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n", - i+1, b->mcidx, + pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pF\n", + i + 1, b->debug[i].op, b->debug[i].args[0], b->entries[i].result, b->caller[i]); +#else + pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\n", + i + 1, + b->entries[i].op, + b->entries[i].args[0], + b->entries[i].result); +#endif } } -#endif } b->mcidx = 0; @@ -126,8 +133,6 @@ void xen_mc_flush(void) b->cbidx = 0; local_irq_restore(flags); - - WARN_ON(ret); } struct multicall_space __xen_mc_entry(size_t args) -- cgit v1.2.3-55-g7522 From 123664101aa2156d05251704fc63f9bcbf77741a Mon Sep 17 00:00:00 2001 From: Igor Druzhinin Date: Tue, 27 Nov 2018 20:58:21 +0000 Subject: Revert "xen/balloon: Mark unallocated host memory as UNUSABLE" This reverts commit b3cf8528bb21febb650a7ecbf080d0647be40b9f. That commit unintentionally broke Xen balloon memory hotplug with "hotplug_unpopulated" set to 1. As long as "System RAM" resource got assigned under a new "Unusable memory" resource in IO/Mem tree any attempt to online this memory would fail due to general kernel restrictions on having "System RAM" resources as 1st level only. The original issue that commit has tried to workaround fa564ad96366 ("x86/PCI: Enable a 64bit BAR on AMD Family 15h (Models 00-1f, 30-3f, 60-7f)") also got amended by the following 03a551734 ("x86/PCI: Move and shrink AMD 64-bit window to avoid conflict") which made the original fix to Xen ballooning unnecessary. Signed-off-by: Igor Druzhinin Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross --- arch/x86/xen/enlighten.c | 78 ------------------------------------------------ arch/x86/xen/setup.c | 6 ++-- drivers/xen/balloon.c | 65 ++++++---------------------------------- include/xen/balloon.h | 5 ---- 4 files changed, 13 insertions(+), 141 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 67b2f31a1265..aa1cc483bd2a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -346,80 +345,3 @@ void xen_arch_unregister_cpu(int num) } EXPORT_SYMBOL(xen_arch_unregister_cpu); #endif - -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG -void __init arch_xen_balloon_init(struct resource *hostmem_resource) -{ - struct xen_memory_map memmap; - int rc; - unsigned int i, last_guest_ram; - phys_addr_t max_addr = PFN_PHYS(max_pfn); - struct e820_table *xen_e820_table; - const struct e820_entry *entry; - struct resource *res; - - if (!xen_initial_domain()) - return; - - xen_e820_table = kmalloc(sizeof(*xen_e820_table), GFP_KERNEL); - if (!xen_e820_table) - return; - - memmap.nr_entries = ARRAY_SIZE(xen_e820_table->entries); - set_xen_guest_handle(memmap.buffer, xen_e820_table->entries); - rc = HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap); - if (rc) { - pr_warn("%s: Can't read host e820 (%d)\n", __func__, rc); - goto out; - } - - last_guest_ram = 0; - for (i = 0; i < memmap.nr_entries; i++) { - if (xen_e820_table->entries[i].addr >= max_addr) - break; - if (xen_e820_table->entries[i].type == E820_TYPE_RAM) - last_guest_ram = i; - } - - entry = &xen_e820_table->entries[last_guest_ram]; - if (max_addr >= entry->addr + entry->size) - goto out; /* No unallocated host RAM. */ - - hostmem_resource->start = max_addr; - hostmem_resource->end = entry->addr + entry->size; - - /* - * Mark non-RAM regions between the end of dom0 RAM and end of host RAM - * as unavailable. The rest of that region can be used for hotplug-based - * ballooning. - */ - for (; i < memmap.nr_entries; i++) { - entry = &xen_e820_table->entries[i]; - - if (entry->type == E820_TYPE_RAM) - continue; - - if (entry->addr >= hostmem_resource->end) - break; - - res = kzalloc(sizeof(*res), GFP_KERNEL); - if (!res) - goto out; - - res->name = "Unavailable host RAM"; - res->start = entry->addr; - res->end = (entry->addr + entry->size < hostmem_resource->end) ? - entry->addr + entry->size : hostmem_resource->end; - rc = insert_resource(hostmem_resource, res); - if (rc) { - pr_warn("%s: Can't insert [%llx - %llx) (%d)\n", - __func__, res->start, res->end, rc); - kfree(res); - goto out; - } - } - - out: - kfree(xen_e820_table); -} -#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1163e33121fb..075ed47993bb 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -808,6 +808,7 @@ char * __init xen_memory_setup(void) addr = xen_e820_table.entries[0].addr; size = xen_e820_table.entries[0].size; while (i < xen_e820_table.nr_entries) { + bool discard = false; chunk_size = size; type = xen_e820_table.entries[i].type; @@ -823,10 +824,11 @@ char * __init xen_memory_setup(void) xen_add_extra_mem(pfn_s, n_pfns); xen_max_p2m_pfn = pfn_s + n_pfns; } else - type = E820_TYPE_UNUSABLE; + discard = true; } - xen_align_and_add_e820_region(addr, chunk_size, type); + if (!discard) + xen_align_and_add_e820_region(addr, chunk_size, type); addr += chunk_size; size -= chunk_size; diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index e12bb256036f..7ab6caef599c 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -251,25 +251,10 @@ static void release_memory_resource(struct resource *resource) kfree(resource); } -/* - * Host memory not allocated to dom0. We can use this range for hotplug-based - * ballooning. - * - * It's a type-less resource. Setting IORESOURCE_MEM will make resource - * management algorithms (arch_remove_reservations()) look into guest e820, - * which we don't want. - */ -static struct resource hostmem_resource = { - .name = "Host RAM", -}; - -void __attribute__((weak)) __init arch_xen_balloon_init(struct resource *res) -{} - static struct resource *additional_memory_resource(phys_addr_t size) { - struct resource *res, *res_hostmem; - int ret = -ENOMEM; + struct resource *res; + int ret; res = kzalloc(sizeof(*res), GFP_KERNEL); if (!res) @@ -278,42 +263,13 @@ static struct resource *additional_memory_resource(phys_addr_t size) res->name = "System RAM"; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - res_hostmem = kzalloc(sizeof(*res), GFP_KERNEL); - if (res_hostmem) { - /* Try to grab a range from hostmem */ - res_hostmem->name = "Host memory"; - ret = allocate_resource(&hostmem_resource, res_hostmem, - size, 0, -1, - PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); - } - - if (!ret) { - /* - * Insert this resource into iomem. Because hostmem_resource - * tracks portion of guest e820 marked as UNUSABLE noone else - * should try to use it. - */ - res->start = res_hostmem->start; - res->end = res_hostmem->end; - ret = insert_resource(&iomem_resource, res); - if (ret < 0) { - pr_err("Can't insert iomem_resource [%llx - %llx]\n", - res->start, res->end); - release_memory_resource(res_hostmem); - res_hostmem = NULL; - res->start = res->end = 0; - } - } - - if (ret) { - ret = allocate_resource(&iomem_resource, res, - size, 0, -1, - PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); - if (ret < 0) { - pr_err("Cannot allocate new System RAM resource\n"); - kfree(res); - return NULL; - } + ret = allocate_resource(&iomem_resource, res, + size, 0, -1, + PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); + if (ret < 0) { + pr_err("Cannot allocate new System RAM resource\n"); + kfree(res); + return NULL; } #ifdef CONFIG_SPARSEMEM @@ -325,7 +281,6 @@ static struct resource *additional_memory_resource(phys_addr_t size) pr_err("New System RAM resource outside addressable RAM (%lu > %lu)\n", pfn, limit); release_memory_resource(res); - release_memory_resource(res_hostmem); return NULL; } } @@ -747,8 +702,6 @@ static int __init balloon_init(void) set_online_page_callback(&xen_online_page); register_memory_notifier(&xen_memory_nb); register_sysctl_table(xen_root); - - arch_xen_balloon_init(&hostmem_resource); #endif #ifdef CONFIG_XEN_PV diff --git a/include/xen/balloon.h b/include/xen/balloon.h index 61f410fd74e4..4914b93a23f2 100644 --- a/include/xen/balloon.h +++ b/include/xen/balloon.h @@ -44,8 +44,3 @@ static inline void xen_balloon_init(void) { } #endif - -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG -struct resource; -void arch_xen_balloon_init(struct resource *hostmem_resource); -#endif -- cgit v1.2.3-55-g7522 From 182ddd16194cd082f25fa1b063dae3c7c5cce384 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 3 Dec 2018 11:38:11 +0100 Subject: x86/boot: Clear RSDP address in boot_params for broken loaders Gunnar Krueger reported a systemd-boot failure and bisected it down to: e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address from boot params if available") In case a broken boot loader doesn't clear its 'struct boot_params', clear rsdp_addr in sanitize_boot_params(). Reported-by: Gunnar Krueger Tested-by: Gunnar Krueger Signed-off-by: Juergen Gross Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@alien8.de Cc: sstabellini@kernel.org Fixes: e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address from boot params if available") Link: http://lkml.kernel.org/r/20181203103811.17056-1-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/bootparam_utils.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index a07ffd23e4dd..f6f6ef436599 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -36,6 +36,7 @@ static void sanitize_boot_params(struct boot_params *boot_params) */ if (boot_params->sentinel) { /* fields in boot_params are left uninitialized, clear them */ + boot_params->acpi_rsdp_addr = 0; memset(&boot_params->ext_ramdisk_image, 0, (char *)&boot_params->efi_info - (char *)&boot_params->ext_ramdisk_image); -- cgit v1.2.3-55-g7522 From 25896d073d8a0403b07e6dec56f58e6c33678207 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 5 Dec 2018 15:27:19 +0900 Subject: x86/build: Fix compiler support check for CONFIG_RETPOLINE It is troublesome to add a diagnostic like this to the Makefile parse stage because the top-level Makefile could be parsed with a stale include/config/auto.conf. Once you are hit by the error about non-retpoline compiler, the compilation still breaks even after disabling CONFIG_RETPOLINE. The easiest fix is to move this check to the "archprepare" like this commit did: 829fe4aa9ac1 ("x86: Allow generating user-space headers without a compiler") Reported-by: Meelis Roos Tested-by: Meelis Roos Signed-off-by: Masahiro Yamada Acked-by: Zhenzhong Duan Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Zhenzhong Duan Fixes: 4cd24de3a098 ("x86/retpoline: Make CONFIG_RETPOLINE depend on compiler support") Link: http://lkml.kernel.org/r/1543991239-18476-1-git-send-email-yamada.masahiro@socionext.com Link: https://lkml.org/lkml/2018/12/4/206 Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index f5d7f4134524..75ef499a66e2 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -220,9 +220,6 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables # Avoid indirect branches in kernel to deal with Spectre ifdef CONFIG_RETPOLINE -ifeq ($(RETPOLINE_CFLAGS),) - $(error You are building kernel with non-retpoline compiler, please update your compiler.) -endif KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) endif @@ -307,6 +304,13 @@ ifndef CC_HAVE_ASM_GOTO @echo Compiler lacks asm-goto support. @exit 1 endif +ifdef CONFIG_RETPOLINE +ifeq ($(RETPOLINE_CFLAGS),) + @echo "You are building kernel with non-retpoline compiler." >&2 + @echo "Please update your compiler." >&2 + @false +endif +endif archclean: $(Q)rm -rf $(objtree)/arch/i386 -- cgit v1.2.3-55-g7522 From ac3e233d29f7f77f28243af0132057d378d3ea58 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 6 Dec 2018 11:12:31 -0800 Subject: x86/vdso: Drop implicit common-page-size linker flag GNU linker's -z common-page-size's default value is based on the target architecture. arch/x86/entry/vdso/Makefile sets it to the architecture default, which is implicit and redundant. Drop it. Fixes: 2aae950b21e4 ("x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu") Reported-by: Dmitry Golovin Reported-by: Bill Wendling Suggested-by: Dmitry Golovin Suggested-by: Rui Ueyama Signed-off-by: Nick Desaulniers Signed-off-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Andi Kleen Cc: Fangrui Song Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20181206191231.192355-1-ndesaulniers@google.com Link: https://bugs.llvm.org/show_bug.cgi?id=38774 Link: https://github.com/ClangBuiltLinux/linux/issues/31 --- arch/x86/entry/vdso/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 141d415a8c80..0624bf2266fd 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -47,7 +47,7 @@ targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so) CPPFLAGS_vdso.lds += -P -C VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \ - -z max-page-size=4096 -z common-page-size=4096 + -z max-page-size=4096 $(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) @@ -98,7 +98,7 @@ CFLAGS_REMOVE_vvar.o = -pg CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \ - -z max-page-size=4096 -z common-page-size=4096 + -z max-page-size=4096 # x32-rebranded versions vobjx32s-y := $(vobjs-y:.o=-x32.o) -- cgit v1.2.3-55-g7522 From 16877a5570e0c5f4270d5b17f9bab427bcae9514 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Fri, 30 Nov 2018 23:23:27 +0300 Subject: x86/mm: Fix guard hole handling There is a guard hole at the beginning of the kernel address space, also used by hypervisors. It occupies 16 PGD entries. This reserved range is not defined explicitely, it is calculated relative to other entities: direct mapping and user space ranges. The calculation got broken by recent changes of the kernel memory layout: LDT remap range is now mapped before direct mapping and makes the calculation invalid. The breakage leads to crash on Xen dom0 boot[1]. Define the reserved range explicitely. It's part of kernel ABI (hypervisors expect it to be stable) and must not depend on changes in the rest of kernel memory layout. [1] https://lists.xenproject.org/archives/html/xen-devel/2018-11/msg03313.html Fixes: d52888aa2753 ("x86/mm: Move LDT remap out of KASLR region on 5-level paging") Reported-by: Hans van Kranenburg Signed-off-by: Kirill A. Shutemov Signed-off-by: Thomas Gleixner Tested-by: Hans van Kranenburg Reviewed-by: Juergen Gross Cc: bp@alien8.de Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: luto@kernel.org Cc: peterz@infradead.org Cc: boris.ostrovsky@oracle.com Cc: bhe@redhat.com Cc: linux-mm@kvack.org Cc: xen-devel@lists.xenproject.org Link: https://lkml.kernel.org/r/20181130202328.65359-2-kirill.shutemov@linux.intel.com --- arch/x86/include/asm/pgtable_64_types.h | 5 +++++ arch/x86/mm/dump_pagetables.c | 8 ++++---- arch/x86/xen/mmu_pv.c | 11 ++++++----- 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 84bd9bdc1987..88bca456da99 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -111,6 +111,11 @@ extern unsigned int ptrs_per_p4d; */ #define MAXMEM (1UL << MAX_PHYSMEM_BITS) +#define GUARD_HOLE_PGD_ENTRY -256UL +#define GUARD_HOLE_SIZE (16UL << PGDIR_SHIFT) +#define GUARD_HOLE_BASE_ADDR (GUARD_HOLE_PGD_ENTRY << PGDIR_SHIFT) +#define GUARD_HOLE_END_ADDR (GUARD_HOLE_BASE_ADDR + GUARD_HOLE_SIZE) + #define LDT_PGD_ENTRY -240UL #define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #define LDT_END_ADDR (LDT_BASE_ADDR + PGDIR_SIZE) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index fc37bbd23eb8..dad153e5a427 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -512,11 +512,11 @@ static inline bool is_hypervisor_range(int idx) { #ifdef CONFIG_X86_64 /* - * ffff800000000000 - ffff87ffffffffff is reserved for - * the hypervisor. + * A hole in the beginning of kernel address space reserved + * for a hypervisor. */ - return (idx >= pgd_index(__PAGE_OFFSET) - 16) && - (idx < pgd_index(__PAGE_OFFSET)); + return (idx >= pgd_index(GUARD_HOLE_BASE_ADDR)) && + (idx < pgd_index(GUARD_HOLE_END_ADDR)); #else return false; #endif diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index a5d7ed125337..0f4fe206dcc2 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -648,19 +648,20 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, unsigned long limit) { int i, nr, flush = 0; - unsigned hole_low, hole_high; + unsigned hole_low = 0, hole_high = 0; /* The limit is the last byte to be touched */ limit--; BUG_ON(limit >= FIXADDR_TOP); +#ifdef CONFIG_X86_64 /* * 64-bit has a great big hole in the middle of the address - * space, which contains the Xen mappings. On 32-bit these - * will end up making a zero-sized hole and so is a no-op. + * space, which contains the Xen mappings. */ - hole_low = pgd_index(USER_LIMIT); - hole_high = pgd_index(PAGE_OFFSET); + hole_low = pgd_index(GUARD_HOLE_BASE_ADDR); + hole_high = pgd_index(GUARD_HOLE_END_ADDR); +#endif nr = pgd_index(limit) + 1; for (i = 0; i < nr; i++) { -- cgit v1.2.3-55-g7522 From 254eb5505ca0ca749d3a491fc6668b6c16647a99 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Fri, 30 Nov 2018 23:23:28 +0300 Subject: x86/dump_pagetables: Fix LDT remap address marker The LDT remap placement has been changed. It's now placed before the direct mapping in the kernel virtual address space for both paging modes. Change address markers order accordingly. Fixes: d52888aa2753 ("x86/mm: Move LDT remap out of KASLR region on 5-level paging") Signed-off-by: Kirill A. Shutemov Signed-off-by: Thomas Gleixner Cc: bp@alien8.de Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: luto@kernel.org Cc: peterz@infradead.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: bhe@redhat.com Cc: hans.van.kranenburg@mendix.com Cc: linux-mm@kvack.org Cc: xen-devel@lists.xenproject.org Link: https://lkml.kernel.org/r/20181130202328.65359-3-kirill.shutemov@linux.intel.com --- arch/x86/mm/dump_pagetables.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index dad153e5a427..abcb8d00b014 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -55,10 +55,10 @@ struct addr_marker { enum address_markers_idx { USER_SPACE_NR = 0, KERNEL_SPACE_NR, - LOW_KERNEL_NR, -#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL) +#ifdef CONFIG_MODIFY_LDT_SYSCALL LDT_NR, #endif + LOW_KERNEL_NR, VMALLOC_START_NR, VMEMMAP_START_NR, #ifdef CONFIG_KASAN @@ -66,9 +66,6 @@ enum address_markers_idx { KASAN_SHADOW_END_NR, #endif CPU_ENTRY_AREA_NR, -#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) - LDT_NR, -#endif #ifdef CONFIG_X86_ESPFIX64 ESPFIX_START_NR, #endif -- cgit v1.2.3-55-g7522 From 80b71c340f17705ec145911b9a193ea781811b16 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 10 Dec 2018 13:21:54 -0800 Subject: x86/intel_rdt: Ensure a CPU remains online for the region's pseudo-locking sequence The user triggers the creation of a pseudo-locked region when writing the requested schemata to the schemata resctrl file. The pseudo-locking of a region is required to be done on a CPU that is associated with the cache on which the pseudo-locked region will reside. In order to run the locking code on a specific CPU, the needed CPU has to be selected and ensured to remain online during the entire locking sequence. At this time, the cpu_hotplug_lock is not taken during the pseudo-lock region creation and it is thus possible for a CPU to be selected to run the pseudo-locking code and then that CPU to go offline before the thread is able to run on it. Fix this by ensuring that the cpu_hotplug_lock is taken while the CPU on which code has to run needs to be controlled. Since the cpu_hotplug_lock is always taken before rdtgroup_mutex the lock order is maintained. Fixes: e0bdfe8e36f3 ("x86/intel_rdt: Support creation/removal of pseudo-locked region") Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Fenghua Yu Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Tony Luck Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: stable Cc: x86-ml Link: https://lkml.kernel.org/r/b7b17432a80f95a1fa21a1698ba643014f58ad31.1544476425.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 27937458c231..efa4a519f5e5 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -23,6 +23,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -310,9 +311,11 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, return -EINVAL; buf[nbytes - 1] = '\0'; + cpus_read_lock(); rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { rdtgroup_kn_unlock(of->kn); + cpus_read_unlock(); return -ENOENT; } rdt_last_cmd_clear(); @@ -367,6 +370,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, out: rdtgroup_kn_unlock(of->kn); + cpus_read_unlock(); return ret ?: nbytes; } -- cgit v1.2.3-55-g7522 From 51c3fbd89d7554caa3290837604309f8d8669d99 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 11 Dec 2018 07:49:39 -0800 Subject: x86/mm: Fix decoy address handling vs 32-bit builds A decoy address is used by set_mce_nospec() to update the cache attributes for a page that may contain poison (multi-bit ECC error) while attempting to minimize the possibility of triggering a speculative access to that page. When reserve_memtype() is handling a decoy address it needs to convert it to its real physical alias. The conversion, AND'ing with __PHYSICAL_MASK, is broken for a 32-bit physical mask and reserve_memtype() is passed the last physical page. Gert reports triggering the: BUG_ON(start >= end); ...assertion when running a 32-bit non-PAE build on a platform that has a driver resource at the top of physical memory: BIOS-e820: [mem 0x00000000fff00000-0x00000000ffffffff] reserved Given that the decoy address scheme is only targeted at 64-bit builds and assumes that the top of physical address space is free for use as a decoy address range, simply bypass address sanitization in the 32-bit case. Lastly, there was no need to crash the system when this failure occurred, and no need to crash future systems if the assumptions of decoy addresses are ever violated. Change the BUG_ON() to a WARN() with an error return. Fixes: 510ee090abc3 ("x86/mm/pat: Prepare {reserve, free}_memtype() for...") Reported-by: Gert Robben Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Tested-by: Gert Robben Cc: stable@vger.kernel.org Cc: Andy Shevchenko Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: platform-driver-x86@vger.kernel.org Cc: Link: https://lkml.kernel.org/r/154454337985.789277.12133288391664677775.stgit@dwillia2-desk3.amr.corp.intel.com --- arch/x86/mm/pat.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 08013524fba1..4fe956a63b25 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -519,8 +519,13 @@ static u64 sanitize_phys(u64 address) * for a "decoy" virtual address (bit 63 clear) passed to * set_memory_X(). __pa() on a "decoy" address results in a * physical address with bit 63 set. + * + * Decoy addresses are not present for 32-bit builds, see + * set_mce_nospec(). */ - return address & __PHYSICAL_MASK; + if (IS_ENABLED(CONFIG_X86_64)) + return address & __PHYSICAL_MASK; + return address; } /* @@ -546,7 +551,11 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, start = sanitize_phys(start); end = sanitize_phys(end); - BUG_ON(start >= end); /* end is exclusive */ + if (start >= end) { + WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__, + start, end - 1, cattr_name(req_type)); + return -EINVAL; + } if (!pat_enabled()) { /* This is identical to page table setting without PAT */ -- cgit v1.2.3-55-g7522 From cd01544a268ad8ee5b1dfe42c4393f1095f86879 Mon Sep 17 00:00:00 2001 From: Alistair Strachan Date: Fri, 14 Dec 2018 14:36:37 -0800 Subject: x86/vdso: Pass --eh-frame-hdr to the linker Commit 379d98ddf413 ("x86: vdso: Use $LD instead of $CC to link") accidentally broke unwinding from userspace, because ld would strip the .eh_frame sections when linking. Originally, the compiler would implicitly add --eh-frame-hdr when invoking the linker, but when this Makefile was converted from invoking ld via the compiler, to invoking it directly (like vmlinux does), the flag was missed. (The EH_FRAME section is important for the VDSO shared libraries, but not for vmlinux.) Fix the problem by explicitly specifying --eh-frame-hdr, which restores parity with the old method. See relevant bug reports for additional info: https://bugzilla.kernel.org/show_bug.cgi?id=201741 https://bugzilla.redhat.com/show_bug.cgi?id=1659295 Fixes: 379d98ddf413 ("x86: vdso: Use $LD instead of $CC to link") Reported-by: Florian Weimer Reported-by: Carlos O'Donell Reported-by: "H. J. Lu" Signed-off-by: Alistair Strachan Signed-off-by: Borislav Petkov Tested-by: Laura Abbott Cc: Andy Lutomirski Cc: Carlos O'Donell Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joel Fernandes Cc: kernel-team@android.com Cc: Laura Abbott Cc: stable Cc: Thomas Gleixner Cc: X86 ML Link: https://lkml.kernel.org/r/20181214223637.35954-1-astrachan@google.com --- arch/x86/entry/vdso/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 0624bf2266fd..5bfe2243a08f 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -171,7 +171,8 @@ quiet_cmd_vdso = VDSO $@ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' VDSO_LDFLAGS = -shared $(call ld-option, --hash-style=both) \ - $(call ld-option, --build-id) -Bsymbolic + $(call ld-option, --build-id) $(call ld-option, --eh-frame-hdr) \ + -Bsymbolic GCOV_PROFILE := n # -- cgit v1.2.3-55-g7522 From 721066dfd4d5c0fee5772c777d6930d0f423b4eb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Dec 2018 18:03:44 +0100 Subject: x86/mm/cpa: Fix cpa_flush_array() TLB invalidation In commit: a7295fd53c39 ("x86/mm/cpa: Use flush_tlb_kernel_range()") I misread the CAP array code and incorrectly used tlb_flush_kernel_range(), resulting in missing TLB flushes and consequent failures. Instead do a full invalidate in this case -- for now. Reported-by: StDenis, Tom Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: dave.hansen@intel.com Fixes: a7295fd53c39 ("x86/mm/cpa: Use flush_tlb_kernel_range()") Link: http://lkml.kernel.org/r/20181203171043.089868285@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index db7a10082238..a1bcde35db4c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -285,20 +285,16 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static bool __cpa_flush_range(unsigned long start, int numpages, int cache) +static bool __inv_flush_all(int cache) { BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - WARN_ON(PAGE_ALIGN(start) != start); - if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); return true; } - flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); - - return !cache; + return false; } static void cpa_flush_range(unsigned long start, int numpages, int cache) @@ -306,7 +302,14 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) unsigned int i, level; unsigned long addr; - if (__cpa_flush_range(start, numpages, cache)) + WARN_ON(PAGE_ALIGN(start) != start); + + if (__inv_flush_all(cache)) + return; + + flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); + + if (!cache) return; /* @@ -332,7 +335,12 @@ static void cpa_flush_array(unsigned long baddr, unsigned long *start, { unsigned int i, level; - if (__cpa_flush_range(baddr, numpages, cache)) + if (__inv_flush_all(cache)) + return; + + flush_tlb_all(); + + if (!cache) return; /* -- cgit v1.2.3-55-g7522