From 7cc4eb1bdd8b082f3d889daccd9412aa10e56165 Mon Sep 17 00:00:00 2001
From: Kirill A. Shutemov
Date: Fri, 9 Feb 2018 17:22:25 +0300
Subject: x86/boot/compressed/64: Rename pagetable.c to kaslr_64.c

The name of the file -- pagetable.c -- is misleading: it only contains
helpers used for KASLR in 64-bit mode. Let's rename the file to reflect
its content.

Signed-off-by: Kirill A. Shutemov
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Cyrill Gorcunov
Cc: Linus Torvalds
Cc: Matthew Wilcox
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20180209142228.21231-2-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar
---
 arch/x86/boot/compressed/Makefile    |   2 +-
 arch/x86/boot/compressed/kaslr_64.c  | 157 +++++++++++++++++++++++++++++++++++
 arch/x86/boot/compressed/pagetable.c | 157 -----------------------------------
 3 files changed, 158 insertions(+), 158 deletions(-)
 create mode 100644 arch/x86/boot/compressed/kaslr_64.c
 delete mode 100644 arch/x86/boot/compressed/pagetable.c
(limited to 'arch')

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f25e1530e064..1f734cd98fd3 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -78,7 +78,7 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
 vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
 vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
 ifdef CONFIG_X86_64
-	vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o
+	vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o
 	vmlinux-objs-y += $(obj)/mem_encrypt.o
 	vmlinux-objs-y += $(obj)/pgtable_64.o
 endif
diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c
new file mode 100644
index 000000000000..b5e5e02f8cde
--- /dev/null
+++ b/arch/x86/boot/compressed/kaslr_64.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This code is used on x86_64 to create page table identity mappings on
+ * demand by building up a new set of page tables (or appending to the
+ * existing ones), and then switching over to them when ready.
+ *
+ * Copyright (C) 2015-2016  Yinghai Lu
+ * Copyright (C) 2016  Kees Cook
+ */
+
+/*
+ * Since we're dealing with identity mappings, physical and virtual
+ * addresses are the same, so override these defines which are ultimately
+ * used by the headers in misc.h.
+ */
+#define __pa(x)  ((unsigned long)(x))
+#define __va(x)  ((void *)((unsigned long)(x)))
+
+/*
+ * The pgtable.h and mm/ident_map.c includes make use of the SME related
+ * information which is not used in the compressed image support. Un-define
+ * the SME support to avoid any compile and link errors.
+ */
+#undef CONFIG_AMD_MEM_ENCRYPT
+
+/* No PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_PAGE_TABLE_ISOLATION
+
+#include "misc.h"
+
+/* These actually do the work of building the kernel identity maps. */
+#include <asm/init.h>
+#include <asm/pgtable.h>
+/* Use the static base for this part of the boot process */
+#undef __PAGE_OFFSET
+#define __PAGE_OFFSET __PAGE_OFFSET_BASE
+#include "../../mm/ident_map.c"
+
+/* Used by pgtable.h asm code to force instruction serialization. */
+unsigned long __force_order;
+
+/* Used to track our page table allocation area. */
+struct alloc_pgt_data {
+	unsigned char *pgt_buf;
+	unsigned long pgt_buf_size;
+	unsigned long pgt_buf_offset;
+};
+
+/*
+ * Allocates space for a page table entry, using struct alloc_pgt_data
+ * above. Besides the local callers, this is used as the allocation
+ * callback in mapping_info below.
+ */
+static void *alloc_pgt_page(void *context)
+{
+	struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context;
+	unsigned char *entry;
+
+	/* Validate there is space available for a new page. */
+	if (pages->pgt_buf_offset >= pages->pgt_buf_size) {
+		debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
+		debug_putaddr(pages->pgt_buf_offset);
+		debug_putaddr(pages->pgt_buf_size);
+		return NULL;
+	}
+
+	entry = pages->pgt_buf + pages->pgt_buf_offset;
+	pages->pgt_buf_offset += PAGE_SIZE;
+
+	return entry;
+}
+
+/* Used to track our allocated page tables. */
+static struct alloc_pgt_data pgt_data;
+
+/* The top level page table entry pointer. */
+static unsigned long top_level_pgt;
+
+/*
+ * Mapping information structure passed to kernel_ident_mapping_init().
+ * Due to relocation, pointers must be assigned at run time, not build time.
+ */
+static struct x86_mapping_info mapping_info;
+
+/* Locates and clears a region for a new top level page table. */
+void initialize_identity_maps(void)
+{
+	unsigned long sev_me_mask = get_sev_encryption_mask();
+
+	/* Init mapping_info with run-time function/buffer pointers. */
+	mapping_info.alloc_pgt_page = alloc_pgt_page;
+	mapping_info.context = &pgt_data;
+	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask;
+	mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask;
+
+	/*
+	 * It should be impossible for this not to already be true,
+	 * but since calling this a second time would rewind the other
+	 * counters, let's just make sure this is reset too.
+	 */
+	pgt_data.pgt_buf_offset = 0;
+
+	/*
+	 * If we came here via startup_32(), cr3 will be _pgtable already
+	 * and we must append to the existing area instead of entirely
+	 * overwriting it.
+	 *
+	 * With 5-level paging, we use '_pgtable' to allocate the p4d page
+	 * table; the top-level page table is allocated separately.
+	 *
+	 * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
+	 * cases. On 4-level paging it's equal to 'top_level_pgt'.
+	 */
+	top_level_pgt = read_cr3_pa();
+	if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
+		debug_putstr("booted via startup_32()\n");
+		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
+		pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
+		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
+	} else {
+		debug_putstr("booted via startup_64()\n");
+		pgt_data.pgt_buf = _pgtable;
+		pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
+		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
+		top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
+	}
+}
+
+/*
+ * Adds the specified range to what will become the new identity mappings.
+ * Once all ranges have been added, the new mapping is activated by calling
+ * finalize_identity_maps() below.
+ */
+void add_identity_map(unsigned long start, unsigned long size)
+{
+	unsigned long end = start + size;
+
+	/* Align boundary to 2M. */
+	start = round_down(start, PMD_SIZE);
+	end = round_up(end, PMD_SIZE);
+	if (start >= end)
+		return;
+
+	/* Build the mapping. */
+	kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
+				  start, end);
+}
+
+/*
+ * This switches the page tables to the new level4 that has been built
+ * via calls to add_identity_map() above. If booted via startup_32(),
+ * this is effectively a no-op.
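+ * (In the startup_32() case, CR3 already points at the page tables that
+ * add_identity_map() extended in place, so the reload below mainly just
+ * flushes the TLB.)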
+ */ +void finalize_identity_maps(void) +{ + write_cr3(top_level_pgt); +} diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c deleted file mode 100644 index b5e5e02f8cde..000000000000 --- a/arch/x86/boot/compressed/pagetable.c +++ /dev/null @@ -1,157 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This code is used on x86_64 to create page table identity mappings on - * demand by building up a new set of page tables (or appending to the - * existing ones), and then switching over to them when ready. - * - * Copyright (C) 2015-2016 Yinghai Lu - * Copyright (C) 2016 Kees Cook - */ - -/* - * Since we're dealing with identity mappings, physical and virtual - * addresses are the same, so override these defines which are ultimately - * used by the headers in misc.h. - */ -#define __pa(x) ((unsigned long)(x)) -#define __va(x) ((void *)((unsigned long)(x))) - -/* - * The pgtable.h and mm/ident_map.c includes make use of the SME related - * information which is not used in the compressed image support. Un-define - * the SME support to avoid any compile and link errors. - */ -#undef CONFIG_AMD_MEM_ENCRYPT - -/* No PAGE_TABLE_ISOLATION support needed either: */ -#undef CONFIG_PAGE_TABLE_ISOLATION - -#include "misc.h" - -/* These actually do the work of building the kernel identity maps. */ -#include -#include -/* Use the static base for this part of the boot process */ -#undef __PAGE_OFFSET -#define __PAGE_OFFSET __PAGE_OFFSET_BASE -#include "../../mm/ident_map.c" - -/* Used by pgtable.h asm code to force instruction serialization. */ -unsigned long __force_order; - -/* Used to track our page table allocation area. */ -struct alloc_pgt_data { - unsigned char *pgt_buf; - unsigned long pgt_buf_size; - unsigned long pgt_buf_offset; -}; - -/* - * Allocates space for a page table entry, using struct alloc_pgt_data - * above. Besides the local callers, this is used as the allocation - * callback in mapping_info below. - */ -static void *alloc_pgt_page(void *context) -{ - struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; - unsigned char *entry; - - /* Validate there is space available for a new page. */ - if (pages->pgt_buf_offset >= pages->pgt_buf_size) { - debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); - debug_putaddr(pages->pgt_buf_offset); - debug_putaddr(pages->pgt_buf_size); - return NULL; - } - - entry = pages->pgt_buf + pages->pgt_buf_offset; - pages->pgt_buf_offset += PAGE_SIZE; - - return entry; -} - -/* Used to track our allocated page tables. */ -static struct alloc_pgt_data pgt_data; - -/* The top level page table entry pointer. */ -static unsigned long top_level_pgt; - -/* - * Mapping information structure passed to kernel_ident_mapping_init(). - * Due to relocation, pointers must be assigned at run time not build time. - */ -static struct x86_mapping_info mapping_info; - -/* Locates and clears a region for a new top level page table. */ -void initialize_identity_maps(void) -{ - unsigned long sev_me_mask = get_sev_encryption_mask(); - - /* Init mapping_info with run-time function/buffer pointers. */ - mapping_info.alloc_pgt_page = alloc_pgt_page; - mapping_info.context = &pgt_data; - mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask; - mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask; - - /* - * It should be impossible for this not to already be true, - * but since calling this a second time would rewind the other - * counters, let's just make sure this is reset too. 
- */ - pgt_data.pgt_buf_offset = 0; - - /* - * If we came here via startup_32(), cr3 will be _pgtable already - * and we must append to the existing area instead of entirely - * overwriting it. - * - * With 5-level paging, we use '_pgtable' to allocate the p4d page table, - * the top-level page table is allocated separately. - * - * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level - * cases. On 4-level paging it's equal to 'top_level_pgt'. - */ - top_level_pgt = read_cr3_pa(); - if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) { - debug_putstr("booted via startup_32()\n"); - pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; - pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; - memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); - } else { - debug_putstr("booted via startup_64()\n"); - pgt_data.pgt_buf = _pgtable; - pgt_data.pgt_buf_size = BOOT_PGT_SIZE; - memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); - top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data); - } -} - -/* - * Adds the specified range to what will become the new identity mappings. - * Once all ranges have been added, the new mapping is activated by calling - * finalize_identity_maps() below. - */ -void add_identity_map(unsigned long start, unsigned long size) -{ - unsigned long end = start + size; - - /* Align boundary to 2M. */ - start = round_down(start, PMD_SIZE); - end = round_up(end, PMD_SIZE); - if (start >= end) - return; - - /* Build the mapping. */ - kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, - start, end); -} - -/* - * This switches the page tables to the new level4 that has been built - * via calls to add_identity_map() above. If booted via startup_32(), - * this is effectively a no-op. - */ -void finalize_identity_maps(void) -{ - write_cr3(top_level_pgt); -} -- cgit v1.2.3-55-g7522 From 4440977be1347d43503f381716e4918413b5a6f0 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Fri, 9 Feb 2018 17:22:26 +0300 Subject: x86/boot/compressed/64: Introduce paging_prepare() Rename l5_paging_required() to paging_prepare() and change the interface of the function. This is a preparation for the next patch, which would make the function also allocate memory for the 32-bit trampoline. The function now returns a 128-bit structure. RAX would return trampoline memory address (zero for now) and RDX would indicate if we need to enable 5-level paging. Signed-off-by: Kirill A. Shutemov [ Typo fixes and general clarification. ] Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Cyrill Gorcunov Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180209142228.21231-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/head_64.S | 41 ++++++++++++++++------------------- arch/x86/boot/compressed/pgtable_64.c | 25 ++++++++++----------- 2 files changed, 31 insertions(+), 35 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fc313e29fe2c..d598d65db32c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -304,20 +304,6 @@ ENTRY(startup_64) /* Set up the stack */ leaq boot_stack_end(%rbx), %rsp -#ifdef CONFIG_X86_5LEVEL - /* - * Check if we need to enable 5-level paging. - * RSI holds real mode data and need to be preserved across - * a function call. 
- */ - pushq %rsi - call l5_paging_required - popq %rsi - - /* If l5_paging_required() returned zero, we're done here. */ - cmpq $0, %rax - je lvl5 - /* * At this point we are in long mode with 4-level paging enabled, * but we want to enable 5-level paging. @@ -325,12 +311,28 @@ ENTRY(startup_64) * The problem is that we cannot do it directly. Setting LA57 in * long mode would trigger #GP. So we need to switch off long mode * first. + */ + + /* + * paging_prepare() sets up the trampoline and checks if we need to + * enable 5-level paging. * - * NOTE: This is not going to work if bootloader put us above 4G - * limit. + * Address of the trampoline is returned in RAX. + * Non zero RDX on return means we need to enable 5-level paging. * - * The first step is go into compatibility mode. + * RSI holds real mode data and needs to be preserved across + * this function call. */ + pushq %rsi + call paging_prepare + popq %rsi + + /* Save the trampoline address in RCX */ + movq %rax, %rcx + + /* Check if we need to enable 5-level paging */ + cmpq $0, %rdx + jz lvl5 /* Clear additional page table */ leaq lvl5_pgtable(%rbx), %rdi @@ -352,7 +354,6 @@ ENTRY(startup_64) pushq %rax lretq lvl5: -#endif /* Zero EFLAGS */ pushq $0 @@ -490,7 +491,6 @@ relocated: jmp *%rax .code32 -#ifdef CONFIG_X86_5LEVEL compatible_mode: /* Setup data and stack segments */ movl $__KERNEL_DS, %eax @@ -526,7 +526,6 @@ compatible_mode: movl %eax, %cr0 lret -#endif no_longmode: /* This isn't an x86-64 CPU so hang */ @@ -585,7 +584,5 @@ boot_stack_end: .balign 4096 pgtable: .fill BOOT_PGT_SIZE, 1, 0 -#ifdef CONFIG_X86_5LEVEL lvl5_pgtable: .fill PAGE_SIZE, 1, 0 -#endif diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index b4469a37e9a1..3f1697fcc7a8 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -9,20 +9,19 @@ */ unsigned long __force_order; -int l5_paging_required(void) -{ - /* Check if leaf 7 is supported. */ - - if (native_cpuid_eax(0) < 7) - return 0; +struct paging_config { + unsigned long trampoline_start; + unsigned long l5_required; +}; - /* Check if la57 is supported. */ - if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) - return 0; +struct paging_config paging_prepare(void) +{ + struct paging_config paging_config = {}; - /* Check if 5-level paging has already been enabled. */ - if (native_read_cr4() & X86_CR4_LA57) - return 0; + /* Check if LA57 is desired and supported */ + if (IS_ENABLED(CONFIG_X86_5LEVEL) && native_cpuid_eax(0) >= 7 && + (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) + paging_config.l5_required = 1; - return 1; + return paging_config; } -- cgit v1.2.3-55-g7522 From 515ab7c41306aad1f80a980e1936ef635c61570c Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 31 Jan 2018 13:19:12 -0800 Subject: x86/mm: Align TLB invalidation info The TLB invalidation info is allocated on the stack, which might cause it to be unaligned. Since this information may be transferred to different cores for TLB shootdown, this may cause an additional cache line to become shared. While the overhead is likely to be small, the fix is simple. We do not use __cacheline_aligned() since it also defines the section, which is inappropriate for stack variables. 
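[ Editor's note: as a standalone illustration, not part of this patch:
  the kernel's __aligned(N) is plain __attribute__((aligned(N))), which
  is valid on stack variables, whereas __cacheline_aligned additionally
  carries a __section__ attribute and therefore only suits global data.
  A minimal user-space sketch, using a hypothetical 64-byte constant as
  a stand-in for SMP_CACHE_BYTES:

	#include <stdint.h>
	#include <stdio.h>

	#define CACHE_LINE 64	/* stand-in for the kernel's SMP_CACHE_BYTES */

	/* Simplified stand-in for struct flush_tlb_info. */
	struct flush_info {
		uint64_t mm, start, end;
	};

	int main(void)
	{
		/* Cache-line aligned even though it lives on the stack. */
		struct flush_info info __attribute__((aligned(CACHE_LINE))) = { 0 };

		/* The address is a multiple of CACHE_LINE. */
		printf("aligned: %d\n", (int)((uintptr_t)&info % CACHE_LINE == 0));
		return 0;
	}
]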
Signed-off-by: Nadav Amit
Acked-by: Andy Lutomirski
Cc: Dave Hansen
Cc: Linus Torvalds
Cc: Nadav Amit
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/20180131211912.52064-1-namit@vmware.com
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/tlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'arch')

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8dcc0607f805..6550d37d0f06 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -613,7 +613,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 {
 	int cpu;
 
-	struct flush_tlb_info info = {
+	struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
 		.mm = mm,
 	};
-- cgit v1.2.3-55-g7522

From 1cd9c22fee3ac21db52a0997d08cf2f065d2c0c0 Mon Sep 17 00:00:00 2001
From: Kirill A. Shutemov
Date: Wed, 31 Jan 2018 16:54:02 +0300
Subject: x86/mm/encrypt: Move page table helpers into separate translation unit

There are a bunch of functions in mem_encrypt.c that operate on the
identity mapping, which means they want virtual addresses to be equal to
the physical ones, without the PAGE_OFFSET shift. We also need to avoid
paravirtualization calls there.

Getting this done is tricky. We cannot use the usual page table helpers.
It forces us to open-code a lot of things. It makes the code ugly and
hard to modify.

We can get it to work with the page table helpers, but it requires a few
preprocessor tricks. These tricks may have side effects for the rest of
the file.

Let's isolate such functions into their own translation unit.

Tested-by: Tom Lendacky
Signed-off-by: Kirill A. Shutemov
Reviewed-by: Tom Lendacky
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20180131135404.40692-2-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/mem_encrypt.h |   1 +
 arch/x86/mm/Makefile               |  14 +-
 arch/x86/mm/mem_encrypt.c          | 578 +----------------------------------
 arch/x86/mm/mem_encrypt_identity.c | 597 +++++++++++++++++++++++++++++++++++++
 4 files changed, 608 insertions(+), 582 deletions(-)
 create mode 100644 arch/x86/mm/mem_encrypt_identity.c
(limited to 'arch')

diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 22c5f3e6f820..8fe61ad21047 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -22,6 +22,7 @@
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 
 extern u64 sme_me_mask;
+extern bool sev_enabled;
 
 void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
 			 unsigned long decrypted_kernel_vaddr,
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 27e9e90a8d35..03c6c8561623 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,12 +1,15 @@
 # SPDX-License-Identifier: GPL-2.0
-# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
-KCOV_INSTRUMENT_tlb.o		:= n
-KCOV_INSTRUMENT_mem_encrypt.o	:= n
+# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c
+KCOV_INSTRUMENT_tlb.o			:= n
+KCOV_INSTRUMENT_mem_encrypt.o		:= n
+KCOV_INSTRUMENT_mem_encrypt_identity.o	:= n
 
-KASAN_SANITIZE_mem_encrypt.o	:= n
+KASAN_SANITIZE_mem_encrypt.o		:= n
+KASAN_SANITIZE_mem_encrypt_identity.o	:= n
 
 ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_mem_encrypt.o	= -pg
+CFLAGS_REMOVE_mem_encrypt.o		= -pg
+CFLAGS_REMOVE_mem_encrypt_identity.o	= -pg
 endif
 
 obj-y	:= init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
@@ -47,4 +50,5 @@ obj-$(CONFIG_RANDOMIZE_MEMORY)	+= kaslr.o
 obj-$(CONFIG_PAGE_TABLE_ISOLATION)	+= pti.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 1a53071e2e17..3a1b5fe4c2ca 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -25,17 +25,12 @@ #include #include #include -#include #include #include #include #include "mm_internal.h" -static char sme_cmdline_arg[] __initdata = "mem_encrypt"; -static char sme_cmdline_on[] __initdata = "on"; -static char sme_cmdline_off[] __initdata = "off"; - /* * Since SME related variables are set early in the boot process they must * reside in the .data section so as not to be zeroed out when the .bss @@ -46,7 +41,7 @@ EXPORT_SYMBOL(sme_me_mask); DEFINE_STATIC_KEY_FALSE(sev_enable_key); EXPORT_SYMBOL_GPL(sev_enable_key); -static bool sev_enabled __section(.data); +bool sev_enabled __section(.data); /* Buffer used for early in-place encryption by BSP, no locking needed */ static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); @@ -463,574 +458,3 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) /* Make the SWIOTLB buffer area decrypted */ set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); } - -struct sme_populate_pgd_data { - void *pgtable_area; - pgd_t *pgd; - - pmdval_t pmd_flags; - pteval_t pte_flags; - unsigned long paddr; - - unsigned long vaddr; - unsigned long vaddr_end; -}; - -static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) -{ - unsigned long pgd_start, pgd_end, pgd_size; - pgd_t *pgd_p; - - pgd_start = ppd->vaddr & PGDIR_MASK; - pgd_end = ppd->vaddr_end & PGDIR_MASK; - - pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t); - - pgd_p = ppd->pgd + pgd_index(ppd->vaddr); - - memset(pgd_p, 0, pgd_size); -} - -#define PGD_FLAGS _KERNPG_TABLE_NOENC -#define P4D_FLAGS _KERNPG_TABLE_NOENC -#define PUD_FLAGS _KERNPG_TABLE_NOENC -#define PMD_FLAGS _KERNPG_TABLE_NOENC - -#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) - -#define PMD_FLAGS_DEC PMD_FLAGS_LARGE -#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ - (_PAGE_PAT | _PAGE_PWT)) - -#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) - -#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) - -#define PTE_FLAGS_DEC PTE_FLAGS -#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ - (_PAGE_PAT | _PAGE_PWT)) - -#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) - -static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) -{ - pgd_t *pgd_p; - p4d_t *p4d_p; - pud_t *pud_p; - pmd_t *pmd_p; - - pgd_p = ppd->pgd + pgd_index(ppd->vaddr); - if (native_pgd_val(*pgd_p)) { - if (IS_ENABLED(CONFIG_X86_5LEVEL)) - p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); - else - pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); - } else { - pgd_t pgd; - - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_p = ppd->pgtable_area; - memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); - ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; - - pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); - } else { - pud_p = ppd->pgtable_area; - memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; - - pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); - } - native_set_pgd(pgd_p, pgd); - } - - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_p += p4d_index(ppd->vaddr); - if (native_p4d_val(*p4d_p)) { - pud_p = (pud_t *)(native_p4d_val(*p4d_p) & 
~PTE_FLAGS_MASK); - } else { - p4d_t p4d; - - pud_p = ppd->pgtable_area; - memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; - - p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); - native_set_p4d(p4d_p, p4d); - } - } - - pud_p += pud_index(ppd->vaddr); - if (native_pud_val(*pud_p)) { - if (native_pud_val(*pud_p) & _PAGE_PSE) - return NULL; - - pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); - } else { - pud_t pud; - - pmd_p = ppd->pgtable_area; - memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); - ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; - - pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); - native_set_pud(pud_p, pud); - } - - return pmd_p; -} - -static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) -{ - pmd_t *pmd_p; - - pmd_p = sme_prepare_pgd(ppd); - if (!pmd_p) - return; - - pmd_p += pmd_index(ppd->vaddr); - if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) - native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags)); -} - -static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) -{ - pmd_t *pmd_p; - pte_t *pte_p; - - pmd_p = sme_prepare_pgd(ppd); - if (!pmd_p) - return; - - pmd_p += pmd_index(ppd->vaddr); - if (native_pmd_val(*pmd_p)) { - if (native_pmd_val(*pmd_p) & _PAGE_PSE) - return; - - pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK); - } else { - pmd_t pmd; - - pte_p = ppd->pgtable_area; - memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE); - ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE; - - pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS); - native_set_pmd(pmd_p, pmd); - } - - pte_p += pte_index(ppd->vaddr); - if (!native_pte_val(*pte_p)) - native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags)); -} - -static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) -{ - while (ppd->vaddr < ppd->vaddr_end) { - sme_populate_pgd_large(ppd); - - ppd->vaddr += PMD_PAGE_SIZE; - ppd->paddr += PMD_PAGE_SIZE; - } -} - -static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) -{ - while (ppd->vaddr < ppd->vaddr_end) { - sme_populate_pgd(ppd); - - ppd->vaddr += PAGE_SIZE; - ppd->paddr += PAGE_SIZE; - } -} - -static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, - pmdval_t pmd_flags, pteval_t pte_flags) -{ - unsigned long vaddr_end; - - ppd->pmd_flags = pmd_flags; - ppd->pte_flags = pte_flags; - - /* Save original end value since we modify the struct value */ - vaddr_end = ppd->vaddr_end; - - /* If start is not 2MB aligned, create PTE entries */ - ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE); - __sme_map_range_pte(ppd); - - /* Create PMD entries */ - ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK; - __sme_map_range_pmd(ppd); - - /* If end is not 2MB aligned, create PTE entries */ - ppd->vaddr_end = vaddr_end; - __sme_map_range_pte(ppd); -} - -static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) -{ - __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); -} - -static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) -{ - __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); -} - -static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) -{ - __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); -} - -static unsigned long __init sme_pgtable_calc(unsigned long len) -{ - unsigned long p4d_size, pud_size, pmd_size, pte_size; - unsigned long total; - - /* - * Perform a relatively simplistic calculation of the 
pagetable - * entries that are needed. Those mappings will be covered mostly - * by 2MB PMD entries so we can conservatively calculate the required - * number of P4D, PUD and PMD structures needed to perform the - * mappings. For mappings that are not 2MB aligned, PTE mappings - * would be needed for the start and end portion of the address range - * that fall outside of the 2MB alignment. This results in, at most, - * two extra pages to hold PTE entries for each range that is mapped. - * Incrementing the count for each covers the case where the addresses - * cross entries. - */ - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; - p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; - pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } else { - p4d_size = 0; - pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } - pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; - pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; - pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE; - - total = p4d_size + pud_size + pmd_size + pte_size; - - /* - * Now calculate the added pagetable structures needed to populate - * the new pagetables. - */ - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; - p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; - pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } else { - p4d_size = 0; - pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } - pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE; - pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; - - total += p4d_size + pud_size + pmd_size; - - return total; -} - -void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp) -{ - unsigned long workarea_start, workarea_end, workarea_len; - unsigned long execute_start, execute_end, execute_len; - unsigned long kernel_start, kernel_end, kernel_len; - unsigned long initrd_start, initrd_end, initrd_len; - struct sme_populate_pgd_data ppd; - unsigned long pgtable_area_len; - unsigned long decrypted_base; - - if (!sme_active()) - return; - - /* - * Prepare for encrypting the kernel and initrd by building new - * pagetables with the necessary attributes needed to encrypt the - * kernel in place. - * - * One range of virtual addresses will map the memory occupied - * by the kernel and initrd as encrypted. - * - * Another range of virtual addresses will map the memory occupied - * by the kernel and initrd as decrypted and write-protected. - * - * The use of write-protect attribute will prevent any of the - * memory from being cached. 
- */ - - /* Physical addresses gives us the identity mapped virtual addresses */ - kernel_start = __pa_symbol(_text); - kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); - kernel_len = kernel_end - kernel_start; - - initrd_start = 0; - initrd_end = 0; - initrd_len = 0; -#ifdef CONFIG_BLK_DEV_INITRD - initrd_len = (unsigned long)bp->hdr.ramdisk_size | - ((unsigned long)bp->ext_ramdisk_size << 32); - if (initrd_len) { - initrd_start = (unsigned long)bp->hdr.ramdisk_image | - ((unsigned long)bp->ext_ramdisk_image << 32); - initrd_end = PAGE_ALIGN(initrd_start + initrd_len); - initrd_len = initrd_end - initrd_start; - } -#endif - - /* Set the encryption workarea to be immediately after the kernel */ - workarea_start = kernel_end; - - /* - * Calculate required number of workarea bytes needed: - * executable encryption area size: - * stack page (PAGE_SIZE) - * encryption routine page (PAGE_SIZE) - * intermediate copy buffer (PMD_PAGE_SIZE) - * pagetable structures for the encryption of the kernel - * pagetable structures for workarea (in case not currently mapped) - */ - execute_start = workarea_start; - execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; - execute_len = execute_end - execute_start; - - /* - * One PGD for both encrypted and decrypted mappings and a set of - * PUDs and PMDs for each of the encrypted and decrypted mappings. - */ - pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; - pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; - if (initrd_len) - pgtable_area_len += sme_pgtable_calc(initrd_len) * 2; - - /* PUDs and PMDs needed in the current pagetables for the workarea */ - pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); - - /* - * The total workarea includes the executable encryption area and - * the pagetable area. The start of the workarea is already 2MB - * aligned, align the end of the workarea on a 2MB boundary so that - * we don't try to create/allocate PTE entries from the workarea - * before it is mapped. - */ - workarea_len = execute_len + pgtable_area_len; - workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE); - - /* - * Set the address to the start of where newly created pagetable - * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable - * structures are created when the workarea is added to the current - * pagetables and when the new encrypted and decrypted kernel - * mappings are populated. - */ - ppd.pgtable_area = (void *)execute_end; - - /* - * Make sure the current pagetable structure has entries for - * addressing the workarea. - */ - ppd.pgd = (pgd_t *)native_read_cr3_pa(); - ppd.paddr = workarea_start; - ppd.vaddr = workarea_start; - ppd.vaddr_end = workarea_end; - sme_map_range_decrypted(&ppd); - - /* Flush the TLB - no globals so cr3 is enough */ - native_write_cr3(__native_read_cr3()); - - /* - * A new pagetable structure is being built to allow for the kernel - * and initrd to be encrypted. It starts with an empty PGD that will - * then be populated with new PUDs and PMDs as the encrypted and - * decrypted kernel mappings are created. - */ - ppd.pgd = ppd.pgtable_area; - memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); - ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; - - /* - * A different PGD index/entry must be used to get different - * pagetable entries for the decrypted mapping. Choose the next - * PGD index and convert it to a virtual address to be used as - * the base of the mapping. 
- */ - decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); - if (initrd_len) { - unsigned long check_base; - - check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1); - decrypted_base = max(decrypted_base, check_base); - } - decrypted_base <<= PGDIR_SHIFT; - - /* Add encrypted kernel (identity) mappings */ - ppd.paddr = kernel_start; - ppd.vaddr = kernel_start; - ppd.vaddr_end = kernel_end; - sme_map_range_encrypted(&ppd); - - /* Add decrypted, write-protected kernel (non-identity) mappings */ - ppd.paddr = kernel_start; - ppd.vaddr = kernel_start + decrypted_base; - ppd.vaddr_end = kernel_end + decrypted_base; - sme_map_range_decrypted_wp(&ppd); - - if (initrd_len) { - /* Add encrypted initrd (identity) mappings */ - ppd.paddr = initrd_start; - ppd.vaddr = initrd_start; - ppd.vaddr_end = initrd_end; - sme_map_range_encrypted(&ppd); - /* - * Add decrypted, write-protected initrd (non-identity) mappings - */ - ppd.paddr = initrd_start; - ppd.vaddr = initrd_start + decrypted_base; - ppd.vaddr_end = initrd_end + decrypted_base; - sme_map_range_decrypted_wp(&ppd); - } - - /* Add decrypted workarea mappings to both kernel mappings */ - ppd.paddr = workarea_start; - ppd.vaddr = workarea_start; - ppd.vaddr_end = workarea_end; - sme_map_range_decrypted(&ppd); - - ppd.paddr = workarea_start; - ppd.vaddr = workarea_start + decrypted_base; - ppd.vaddr_end = workarea_end + decrypted_base; - sme_map_range_decrypted(&ppd); - - /* Perform the encryption */ - sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, - kernel_len, workarea_start, (unsigned long)ppd.pgd); - - if (initrd_len) - sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, - initrd_len, workarea_start, - (unsigned long)ppd.pgd); - - /* - * At this point we are running encrypted. Remove the mappings for - * the decrypted areas - all that is needed for this is to remove - * the PGD entry/entries. - */ - ppd.vaddr = kernel_start + decrypted_base; - ppd.vaddr_end = kernel_end + decrypted_base; - sme_clear_pgd(&ppd); - - if (initrd_len) { - ppd.vaddr = initrd_start + decrypted_base; - ppd.vaddr_end = initrd_end + decrypted_base; - sme_clear_pgd(&ppd); - } - - ppd.vaddr = workarea_start + decrypted_base; - ppd.vaddr_end = workarea_end + decrypted_base; - sme_clear_pgd(&ppd); - - /* Flush the TLB - no globals so cr3 is enough */ - native_write_cr3(__native_read_cr3()); -} - -void __init __nostackprotector sme_enable(struct boot_params *bp) -{ - const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; - unsigned int eax, ebx, ecx, edx; - unsigned long feature_mask; - bool active_by_default; - unsigned long me_mask; - char buffer[16]; - u64 msr; - - /* Check for the SME/SEV support leaf */ - eax = 0x80000000; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - if (eax < 0x8000001f) - return; - -#define AMD_SME_BIT BIT(0) -#define AMD_SEV_BIT BIT(1) - /* - * Set the feature mask (SME or SEV) based on whether we are - * running under a hypervisor. - */ - eax = 1; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - feature_mask = (ecx & BIT(31)) ? 
AMD_SEV_BIT : AMD_SME_BIT; - - /* - * Check for the SME/SEV feature: - * CPUID Fn8000_001F[EAX] - * - Bit 0 - Secure Memory Encryption support - * - Bit 1 - Secure Encrypted Virtualization support - * CPUID Fn8000_001F[EBX] - * - Bits 5:0 - Pagetable bit position used to indicate encryption - */ - eax = 0x8000001f; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - if (!(eax & feature_mask)) - return; - - me_mask = 1UL << (ebx & 0x3f); - - /* Check if memory encryption is enabled */ - if (feature_mask == AMD_SME_BIT) { - /* For SME, check the SYSCFG MSR */ - msr = __rdmsr(MSR_K8_SYSCFG); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) - return; - } else { - /* For SEV, check the SEV MSR */ - msr = __rdmsr(MSR_AMD64_SEV); - if (!(msr & MSR_AMD64_SEV_ENABLED)) - return; - - /* SEV state cannot be controlled by a command line option */ - sme_me_mask = me_mask; - sev_enabled = true; - return; - } - - /* - * Fixups have not been applied to phys_base yet and we're running - * identity mapped, so we must obtain the address to the SME command - * line argument data using rip-relative addressing. - */ - asm ("lea sme_cmdline_arg(%%rip), %0" - : "=r" (cmdline_arg) - : "p" (sme_cmdline_arg)); - asm ("lea sme_cmdline_on(%%rip), %0" - : "=r" (cmdline_on) - : "p" (sme_cmdline_on)); - asm ("lea sme_cmdline_off(%%rip), %0" - : "=r" (cmdline_off) - : "p" (sme_cmdline_off)); - - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) - active_by_default = true; - else - active_by_default = false; - - cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | - ((u64)bp->ext_cmd_line_ptr << 32)); - - cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); - - if (!strncmp(buffer, cmdline_on, sizeof(buffer))) - sme_me_mask = me_mask; - else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) - sme_me_mask = 0; - else - sme_me_mask = active_by_default ? me_mask : 0; -} diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c new file mode 100644 index 000000000000..a28978a37bfa --- /dev/null +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -0,0 +1,597 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#define DISABLE_BRANCH_PROFILING + +#include +#include + +#include +#include +#include + +#include "mm_internal.h" + +#define PGD_FLAGS _KERNPG_TABLE_NOENC +#define P4D_FLAGS _KERNPG_TABLE_NOENC +#define PUD_FLAGS _KERNPG_TABLE_NOENC +#define PMD_FLAGS _KERNPG_TABLE_NOENC + +#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) + +#define PMD_FLAGS_DEC PMD_FLAGS_LARGE +#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) + +#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) + +#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) + +#define PTE_FLAGS_DEC PTE_FLAGS +#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) + +#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) + +struct sme_populate_pgd_data { + void *pgtable_area; + pgd_t *pgd; + + pmdval_t pmd_flags; + pteval_t pte_flags; + unsigned long paddr; + + unsigned long vaddr; + unsigned long vaddr_end; +}; + +static char sme_cmdline_arg[] __initdata = "mem_encrypt"; +static char sme_cmdline_on[] __initdata = "on"; +static char sme_cmdline_off[] __initdata = "off"; + +static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) +{ + unsigned long pgd_start, pgd_end, pgd_size; + pgd_t *pgd_p; + + pgd_start = ppd->vaddr & PGDIR_MASK; + pgd_end = ppd->vaddr_end & PGDIR_MASK; + + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t); + + pgd_p = ppd->pgd + pgd_index(ppd->vaddr); + + memset(pgd_p, 0, pgd_size); +} + +static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) +{ + pgd_t *pgd_p; + p4d_t *p4d_p; + pud_t *pud_p; + pmd_t *pmd_p; + + pgd_p = ppd->pgd + pgd_index(ppd->vaddr); + if (native_pgd_val(*pgd_p)) { + if (IS_ENABLED(CONFIG_X86_5LEVEL)) + p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); + else + pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); + } else { + pgd_t pgd; + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_p = ppd->pgtable_area; + memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); + ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; + + pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); + } else { + pud_p = ppd->pgtable_area; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); + ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + + pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); + } + native_set_pgd(pgd_p, pgd); + } + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_p += p4d_index(ppd->vaddr); + if (native_p4d_val(*p4d_p)) { + pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); + } else { + p4d_t p4d; + + pud_p = ppd->pgtable_area; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); + ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + + p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); + native_set_p4d(p4d_p, p4d); + } + } + + pud_p += pud_index(ppd->vaddr); + if (native_pud_val(*pud_p)) { + if (native_pud_val(*pud_p) & _PAGE_PSE) + return NULL; + + pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); + } else { + pud_t pud; + + pmd_p = ppd->pgtable_area; + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); + ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; + + pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); + native_set_pud(pud_p, pud); + } + + return pmd_p; +} + +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +{ + pmd_t *pmd_p; + + pmd_p = sme_prepare_pgd(ppd); + if (!pmd_p) + return; + + pmd_p += pmd_index(ppd->vaddr); + if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) + 
native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags)); +} + +static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) +{ + pmd_t *pmd_p; + pte_t *pte_p; + + pmd_p = sme_prepare_pgd(ppd); + if (!pmd_p) + return; + + pmd_p += pmd_index(ppd->vaddr); + if (native_pmd_val(*pmd_p)) { + if (native_pmd_val(*pmd_p) & _PAGE_PSE) + return; + + pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK); + } else { + pmd_t pmd; + + pte_p = ppd->pgtable_area; + memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE); + ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE; + + pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS); + native_set_pmd(pmd_p, pmd); + } + + pte_p += pte_index(ppd->vaddr); + if (!native_pte_val(*pte_p)) + native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags)); +} + +static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd_large(ppd); + + ppd->vaddr += PMD_PAGE_SIZE; + ppd->paddr += PMD_PAGE_SIZE; + } +} + +static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd(ppd); + + ppd->vaddr += PAGE_SIZE; + ppd->paddr += PAGE_SIZE; + } +} + +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, + pmdval_t pmd_flags, pteval_t pte_flags) +{ + unsigned long vaddr_end; + + ppd->pmd_flags = pmd_flags; + ppd->pte_flags = pte_flags; + + /* Save original end value since we modify the struct value */ + vaddr_end = ppd->vaddr_end; + + /* If start is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE); + __sme_map_range_pte(ppd); + + /* Create PMD entries */ + ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK; + __sme_map_range_pmd(ppd); + + /* If end is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = vaddr_end; + __sme_map_range_pte(ppd); +} + +static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); +} + +static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); +} + +static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); +} + +static unsigned long __init sme_pgtable_calc(unsigned long len) +{ + unsigned long p4d_size, pud_size, pmd_size, pte_size; + unsigned long total; + + /* + * Perform a relatively simplistic calculation of the pagetable + * entries that are needed. Those mappings will be covered mostly + * by 2MB PMD entries so we can conservatively calculate the required + * number of P4D, PUD and PMD structures needed to perform the + * mappings. For mappings that are not 2MB aligned, PTE mappings + * would be needed for the start and end portion of the address range + * that fall outside of the 2MB alignment. This results in, at most, + * two extra pages to hold PTE entries for each range that is mapped. + * Incrementing the count for each covers the case where the addresses + * cross entries. 
+ */ + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; + pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } else { + p4d_size = 0; + pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } + pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE; + + total = p4d_size + pud_size + pmd_size + pte_size; + + /* + * Now calculate the added pagetable structures needed to populate + * the new pagetables. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; + pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } else { + p4d_size = 0; + pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } + pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE; + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + + total += p4d_size + pud_size + pmd_size; + + return total; +} + +void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp) +{ + unsigned long workarea_start, workarea_end, workarea_len; + unsigned long execute_start, execute_end, execute_len; + unsigned long kernel_start, kernel_end, kernel_len; + unsigned long initrd_start, initrd_end, initrd_len; + struct sme_populate_pgd_data ppd; + unsigned long pgtable_area_len; + unsigned long decrypted_base; + + if (!sme_active()) + return; + + /* + * Prepare for encrypting the kernel and initrd by building new + * pagetables with the necessary attributes needed to encrypt the + * kernel in place. + * + * One range of virtual addresses will map the memory occupied + * by the kernel and initrd as encrypted. + * + * Another range of virtual addresses will map the memory occupied + * by the kernel and initrd as decrypted and write-protected. + * + * The use of write-protect attribute will prevent any of the + * memory from being cached. + */ + + /* Physical addresses gives us the identity mapped virtual addresses */ + kernel_start = __pa_symbol(_text); + kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); + kernel_len = kernel_end - kernel_start; + + initrd_start = 0; + initrd_end = 0; + initrd_len = 0; +#ifdef CONFIG_BLK_DEV_INITRD + initrd_len = (unsigned long)bp->hdr.ramdisk_size | + ((unsigned long)bp->ext_ramdisk_size << 32); + if (initrd_len) { + initrd_start = (unsigned long)bp->hdr.ramdisk_image | + ((unsigned long)bp->ext_ramdisk_image << 32); + initrd_end = PAGE_ALIGN(initrd_start + initrd_len); + initrd_len = initrd_end - initrd_start; + } +#endif + + /* Set the encryption workarea to be immediately after the kernel */ + workarea_start = kernel_end; + + /* + * Calculate required number of workarea bytes needed: + * executable encryption area size: + * stack page (PAGE_SIZE) + * encryption routine page (PAGE_SIZE) + * intermediate copy buffer (PMD_PAGE_SIZE) + * pagetable structures for the encryption of the kernel + * pagetable structures for workarea (in case not currently mapped) + */ + execute_start = workarea_start; + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; + execute_len = execute_end - execute_start; + + /* + * One PGD for both encrypted and decrypted mappings and a set of + * PUDs and PMDs for each of the encrypted and decrypted mappings. 
+ */ + pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; + pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; + if (initrd_len) + pgtable_area_len += sme_pgtable_calc(initrd_len) * 2; + + /* PUDs and PMDs needed in the current pagetables for the workarea */ + pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); + + /* + * The total workarea includes the executable encryption area and + * the pagetable area. The start of the workarea is already 2MB + * aligned, align the end of the workarea on a 2MB boundary so that + * we don't try to create/allocate PTE entries from the workarea + * before it is mapped. + */ + workarea_len = execute_len + pgtable_area_len; + workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE); + + /* + * Set the address to the start of where newly created pagetable + * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable + * structures are created when the workarea is added to the current + * pagetables and when the new encrypted and decrypted kernel + * mappings are populated. + */ + ppd.pgtable_area = (void *)execute_end; + + /* + * Make sure the current pagetable structure has entries for + * addressing the workarea. + */ + ppd.pgd = (pgd_t *)native_read_cr3_pa(); + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); + + /* + * A new pagetable structure is being built to allow for the kernel + * and initrd to be encrypted. It starts with an empty PGD that will + * then be populated with new PUDs and PMDs as the encrypted and + * decrypted kernel mappings are created. + */ + ppd.pgd = ppd.pgtable_area; + memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); + ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; + + /* + * A different PGD index/entry must be used to get different + * pagetable entries for the decrypted mapping. Choose the next + * PGD index and convert it to a virtual address to be used as + * the base of the mapping. 
+ */ + decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); + if (initrd_len) { + unsigned long check_base; + + check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1); + decrypted_base = max(decrypted_base, check_base); + } + decrypted_base <<= PGDIR_SHIFT; + + /* Add encrypted kernel (identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start; + ppd.vaddr_end = kernel_end; + sme_map_range_encrypted(&ppd); + + /* Add decrypted, write-protected kernel (non-identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + + if (initrd_len) { + /* Add encrypted initrd (identity) mappings */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start; + ppd.vaddr_end = initrd_end; + sme_map_range_encrypted(&ppd); + /* + * Add decrypted, write-protected initrd (non-identity) mappings + */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + } + + /* Add decrypted workarea mappings to both kernel mappings */ + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); + + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_map_range_decrypted(&ppd); + + /* Perform the encryption */ + sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, + kernel_len, workarea_start, (unsigned long)ppd.pgd); + + if (initrd_len) + sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, + initrd_len, workarea_start, + (unsigned long)ppd.pgd); + + /* + * At this point we are running encrypted. Remove the mappings for + * the decrypted areas - all that is needed for this is to remove + * the PGD entry/entries. + */ + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_clear_pgd(&ppd); + + if (initrd_len) { + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_clear_pgd(&ppd); + } + + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_clear_pgd(&ppd); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); +} + +void __init __nostackprotector sme_enable(struct boot_params *bp) +{ + const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; + unsigned int eax, ebx, ecx, edx; + unsigned long feature_mask; + bool active_by_default; + unsigned long me_mask; + char buffer[16]; + u64 msr; + + /* Check for the SME/SEV support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return; + +#define AMD_SME_BIT BIT(0) +#define AMD_SEV_BIT BIT(1) + /* + * Set the feature mask (SME or SEV) based on whether we are + * running under a hypervisor. + */ + eax = 1; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + feature_mask = (ecx & BIT(31)) ? 
AMD_SEV_BIT : AMD_SME_BIT; + + /* + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] + * - Bit 0 - Secure Memory Encryption support + * - Bit 1 - Secure Encrypted Virtualization support + * CPUID Fn8000_001F[EBX] + * - Bits 5:0 - Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (!(eax & feature_mask)) + return; + + me_mask = 1UL << (ebx & 0x3f); + + /* Check if memory encryption is enabled */ + if (feature_mask == AMD_SME_BIT) { + /* For SME, check the SYSCFG MSR */ + msr = __rdmsr(MSR_K8_SYSCFG); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + } else { + /* For SEV, check the SEV MSR */ + msr = __rdmsr(MSR_AMD64_SEV); + if (!(msr & MSR_AMD64_SEV_ENABLED)) + return; + + /* SEV state cannot be controlled by a command line option */ + sme_me_mask = me_mask; + sev_enabled = true; + return; + } + + /* + * Fixups have not been applied to phys_base yet and we're running + * identity mapped, so we must obtain the address to the SME command + * line argument data using rip-relative addressing. + */ + asm ("lea sme_cmdline_arg(%%rip), %0" + : "=r" (cmdline_arg) + : "p" (sme_cmdline_arg)); + asm ("lea sme_cmdline_on(%%rip), %0" + : "=r" (cmdline_on) + : "p" (sme_cmdline_on)); + asm ("lea sme_cmdline_off(%%rip), %0" + : "=r" (cmdline_off) + : "p" (sme_cmdline_off)); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) + active_by_default = true; + else + active_by_default = false; + + cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | + ((u64)bp->ext_cmd_line_ptr << 32)); + + cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); + + if (!strncmp(buffer, cmdline_on, sizeof(buffer))) + sme_me_mask = me_mask; + else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) + sme_me_mask = 0; + else + sme_me_mask = active_by_default ? me_mask : 0; +} -- cgit v1.2.3-55-g7522 From aad983913d77af2c3394f29b88d7bb75ebd7d172 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 31 Jan 2018 16:54:03 +0300 Subject: x86/mm/encrypt: Simplify sme_populate_pgd() and sme_populate_pgd_large() sme_populate_pgd() and sme_populate_pgd_large() operate on the identity mapping, which means they want virtual addresses to be equal to physical one, without PAGE_OFFSET shift. We also need to avoid paravirtualization call there. Getting this done is tricky. We cannot use usual page table helpers. It forces us to open-code a lot of things. It makes code ugly and hard to modify. We can get it work with the page table helpers, but it requires few preprocessor tricks. - Define __pa() and __va() to be compatible with identity mapping. - Undef CONFIG_PARAVIRT and CONFIG_PARAVIRT_SPINLOCKS before including any file. This way we can avoid paravirtualization calls. Now we can user normal page table helpers just fine. Tested-by: Tom Lendacky Signed-off-by: Kirill A. 
Shutemov Reviewed-by: Tom Lendacky Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180131135404.40692-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mem_encrypt_identity.c | 159 +++++++++++++++++-------------------- 1 file changed, 72 insertions(+), 87 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index a28978a37bfa..4b6a2e3098c5 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -12,6 +12,24 @@ #define DISABLE_BRANCH_PROFILING +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + +/* + * Special hack: we have to be careful, because no indirections are + * allowed here, and paravirt_ops is a kind of one. As it will only run in + * baremetal anyway, we just keep it from happening. (This list needs to + * be extended when new paravirt and debugging variants are added.) + */ +#undef CONFIG_PARAVIRT +#undef CONFIG_PARAVIRT_SPINLOCKS + +#include #include #include @@ -73,116 +91,83 @@ static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) memset(pgd_p, 0, pgd_size); } -static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) +static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) { - pgd_t *pgd_p; - p4d_t *p4d_p; - pud_t *pud_p; - pmd_t *pmd_p; - - pgd_p = ppd->pgd + pgd_index(ppd->vaddr); - if (native_pgd_val(*pgd_p)) { - if (IS_ENABLED(CONFIG_X86_5LEVEL)) - p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); - else - pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); - } else { - pgd_t pgd; - - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_p = ppd->pgtable_area; - memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); - ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; - - pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); - } else { - pud_p = ppd->pgtable_area; - memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; - - pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); - } - native_set_pgd(pgd_p, pgd); + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = ppd->pgd + pgd_index(ppd->vaddr); + if (pgd_none(*pgd)) { + p4d = ppd->pgtable_area; + memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D); + ppd->pgtable_area += sizeof(*p4d) * PTRS_PER_P4D; + set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d))); } - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_p += p4d_index(ppd->vaddr); - if (native_p4d_val(*p4d_p)) { - pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); - } else { - p4d_t p4d; - - pud_p = ppd->pgtable_area; - memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; - - p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); - native_set_p4d(p4d_p, p4d); - } + p4d = p4d_offset(pgd, ppd->vaddr); + if (p4d_none(*p4d)) { + pud = ppd->pgtable_area; + memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD); + ppd->pgtable_area += sizeof(*pud) * PTRS_PER_PUD; + set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud))); } - pud_p += pud_index(ppd->vaddr); - if (native_pud_val(*pud_p)) { - if (native_pud_val(*pud_p) & _PAGE_PSE) - return NULL; - - pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); - } 
else { - pud_t pud; - - pmd_p = ppd->pgtable_area; - memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); - ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; - - pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); - native_set_pud(pud_p, pud); + pud = pud_offset(p4d, ppd->vaddr); + if (pud_none(*pud)) { + pmd = ppd->pgtable_area; + memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD); + ppd->pgtable_area += sizeof(*pmd) * PTRS_PER_PMD; + set_pud(pud, __pud(PUD_FLAGS | __pa(pmd))); } - return pmd_p; + if (pud_large(*pud)) + return NULL; + + return pud; } static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) { - pmd_t *pmd_p; + pud_t *pud; + pmd_t *pmd; - pmd_p = sme_prepare_pgd(ppd); - if (!pmd_p) + pud = sme_prepare_pgd(ppd); + if (!pud) return; - pmd_p += pmd_index(ppd->vaddr); - if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) - native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags)); + pmd = pmd_offset(pud, ppd->vaddr); + if (pmd_large(*pmd)) + return; + + set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); } static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) { - pmd_t *pmd_p; - pte_t *pte_p; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; - pmd_p = sme_prepare_pgd(ppd); - if (!pmd_p) + pud = sme_prepare_pgd(ppd); + if (!pud) return; - pmd_p += pmd_index(ppd->vaddr); - if (native_pmd_val(*pmd_p)) { - if (native_pmd_val(*pmd_p) & _PAGE_PSE) - return; - - pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK); - } else { - pmd_t pmd; - - pte_p = ppd->pgtable_area; - memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE); - ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE; - - pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS); - native_set_pmd(pmd_p, pmd); + pmd = pmd_offset(pud, ppd->vaddr); + if (pmd_none(*pmd)) { + pte = ppd->pgtable_area; + memset(pte, 0, sizeof(*pte) * PTRS_PER_PTE); + ppd->pgtable_area += sizeof(*pte) * PTRS_PER_PTE; + set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte))); } - pte_p += pte_index(ppd->vaddr); - if (!native_pte_val(*pte_p)) - native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags)); + if (pmd_large(*pmd)) + return; + + pte = pte_offset_map(pmd, ppd->vaddr); + if (pte_none(*pte)) + set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); } static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) -- cgit v1.2.3-55-g7522 From 1070730c1ad2bd49b0d11112728f1f4390137728 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 31 Jan 2018 16:54:04 +0300 Subject: x86/mm/encrypt: Simplify sme_pgtable_calc() sme_pgtable_calc() is unnecessarily complex. It can be rewritten in a more streamlined way. As a side effect, we get the code ready for boot-time switching between paging modes. Tested-by: Tom Lendacky Signed-off-by: Kirill A.
Shutemov Reviewed-by: Tom Lendacky Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180131135404.40692-4-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mem_encrypt_identity.c | 42 +++++++++++--------------------------- 1 file changed, 12 insertions(+), 30 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 4b6a2e3098c5..b4139c5ab972 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -231,8 +231,7 @@ static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) static unsigned long __init sme_pgtable_calc(unsigned long len) { - unsigned long p4d_size, pud_size, pmd_size, pte_size; - unsigned long total; + unsigned long entries = 0, tables = 0; /* * Perform a relatively simplistic calculation of the pagetable @@ -246,42 +245,25 @@ static unsigned long __init sme_pgtable_calc(unsigned long len) * Incrementing the count for each covers the case where the addresses * cross entries. */ - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; - p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; - pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } else { - p4d_size = 0; - pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } - pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; - pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; - pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE; - total = p4d_size + pud_size + pmd_size + pte_size; + /* PGDIR_SIZE is equal to P4D_SIZE on 4-level machine. */ + if (PTRS_PER_P4D > 1) + entries += (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * sizeof(p4d_t) * PTRS_PER_P4D; + entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * sizeof(pud_t) * PTRS_PER_PUD; + entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * sizeof(pmd_t) * PTRS_PER_PMD; + entries += 2 * sizeof(pte_t) * PTRS_PER_PTE; /* * Now calculate the added pagetable structures needed to populate * the new pagetables. */ - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; - p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; - pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } else { - p4d_size = 0; - pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; - pud_size *= sizeof(pud_t) * PTRS_PER_PUD; - } - pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE; - pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; - total += p4d_size + pud_size + pmd_size; + if (PTRS_PER_P4D > 1) + tables += DIV_ROUND_UP(entries, PGDIR_SIZE) * sizeof(p4d_t) * PTRS_PER_P4D; + tables += DIV_ROUND_UP(entries, P4D_SIZE) * sizeof(pud_t) * PTRS_PER_PUD; + tables += DIV_ROUND_UP(entries, PUD_SIZE) * sizeof(pmd_t) * PTRS_PER_PMD; - return total; + return entries + tables; } void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp) -- cgit v1.2.3-55-g7522 From 116fef6408599dd6ff6996235c50aa692e9b5631 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 31 Jan 2018 07:56:22 -0800 Subject: x86/mm/dump_pagetables: Add the EFI pagetable to the debugfs 'page_tables' directory EFI is complicated enough that being able to view its pagetables is quite helpful. Rather than requiring users to fish it out of dmesg on an appropriately configured kernel, let users view it in debugfs as well. 
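Once this is applied, the EFI page tables can be inspected at run time by reading the new root-only (mode 0400) debugfs entry, normally /sys/kernel/debug/page_tables/efi, assuming debugfs is mounted in its usual location.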
Signed-off-by: Andy Lutomirski Acked-by: Ard Biesheuvel Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/ba158a93f3250e6fca752cff2cfb1fcdd9f2b50c.1517414050.git.luto@kernel.org [ Fixed trivial whitespace damage and fixed missing export. ] Signed-off-by: Ingo Molnar --- arch/x86/mm/debug_pagetables.c | 32 ++++++++++++++++++++++++++++++++ arch/x86/platform/efi/efi_64.c | 4 +++- 2 files changed, 35 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index 421f2664ffa0..51a6f92da2bf 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -72,6 +72,31 @@ static const struct file_operations ptdump_curusr_fops = { }; #endif +#if defined(CONFIG_EFI) && defined(CONFIG_X86_64) +extern pgd_t *efi_pgd; +static struct dentry *pe_efi; + +static int ptdump_show_efi(struct seq_file *m, void *v) +{ + if (efi_pgd) + ptdump_walk_pgd_level_debugfs(m, efi_pgd, false); + return 0; +} + +static int ptdump_open_efi(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show_efi, NULL); +} + +static const struct file_operations ptdump_efi_fops = { + .owner = THIS_MODULE, + .open = ptdump_open_efi, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static struct dentry *dir, *pe_knl, *pe_curknl; static int __init pt_dump_debug_init(void) @@ -96,6 +121,13 @@ static int __init pt_dump_debug_init(void) if (!pe_curusr) goto err; #endif + +#if defined(CONFIG_EFI) && defined(CONFIG_X86_64) + pe_efi = debugfs_create_file("efi", 0400, dir, NULL, &ptdump_efi_fops); + if (!pe_efi) + goto err; +#endif + return 0; err: debugfs_remove_recursive(dir); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index c310a8284358..780460aa5ea5 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -190,7 +191,8 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) early_code_mapping_set_exec(0); } -static pgd_t *efi_pgd; +pgd_t *efi_pgd; +EXPORT_SYMBOL_GPL(efi_pgd); /* * We need our own copy of the higher levels of the page tables -- cgit v1.2.3-55-g7522 From b83ce5ee91471d19c403ff91227204fb37c95fb2 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 14:16:48 +0300 Subject: x86/mm/64: Make __PHYSICAL_MASK_SHIFT always 52 __PHYSICAL_MASK_SHIFT is used to define the mask that helps to extract physical address from a page table entry. Although real physical address space available may differ between machines, it's safe to use 52 as __PHYSICAL_MASK_SHIFT. Unused bits above log2(MAXPHYADDR) up to bit 51 are reserved and must be 0. Signed-off-by: Kirill A. 
Shutemov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214111656.88514-2-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page_64_types.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index e1407312c412..f68e6526891d 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -52,11 +52,12 @@ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ -#ifdef CONFIG_X86_5LEVEL + #define __PHYSICAL_MASK_SHIFT 52 + +#ifdef CONFIG_X86_5LEVEL #define __VIRTUAL_MASK_SHIFT 56 #else -#define __PHYSICAL_MASK_SHIFT 46 #define __VIRTUAL_MASK_SHIFT 47 #endif -- cgit v1.2.3-55-g7522 From 02390b87a9459937cdb299e6b34ff33992512ec7 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 14:16:49 +0300 Subject: mm/zsmalloc: Prepare for variable MAX_PHYSMEM_BITS With boot-time switching between paging modes we will have a variable MAX_PHYSMEM_BITS. Let's use the maximum value possible for the CONFIG_X86_5LEVEL=y configuration to define the zsmalloc data structures. The patch introduces MAX_POSSIBLE_PHYSMEM_BITS to cover such a case. It also handles the PAE special case well. Signed-off-by: Kirill A. Shutemov Reviewed-by: Nitin Gupta Acked-by: Minchan Kim Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214111656.88514-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable-3level_types.h | 1 + arch/x86/include/asm/pgtable_64_types.h | 2 ++ mm/zsmalloc.c | 13 +++++++------ 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 876b4c77d983..6a59a6d0cc50 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -44,5 +44,6 @@ typedef union { */ #define PTRS_PER_PTE 512 +#define MAX_POSSIBLE_PHYSMEM_BITS 36 #endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 6b8f73dcbc2c..7168de7d34eb 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -40,6 +40,8 @@ typedef struct { pteval_t pte; } pte_t; #define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) #define P4D_MASK (~(P4D_SIZE - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 52 + #else /* CONFIG_X86_5LEVEL */ /* diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c3013505c305..b7f61cd1c709 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -84,18 +84,19 @@ * This is made more complicated by various memory models and PAE.
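 * (For example, with the 52-bit MAX_POSSIBLE_PHYSMEM_BITS defined above for
 * x86 5-level paging and 4K pages, _PFN_BITS below works out to 52 - 12 = 40.)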
*/ -#ifndef MAX_PHYSMEM_BITS -#ifdef CONFIG_HIGHMEM64G -#define MAX_PHYSMEM_BITS 36 -#else /* !CONFIG_HIGHMEM64G */ +#ifndef MAX_POSSIBLE_PHYSMEM_BITS +#ifdef MAX_PHYSMEM_BITS +#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS +#else /* * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just * be PAGE_SHIFT */ -#define MAX_PHYSMEM_BITS BITS_PER_LONG +#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG #endif #endif -#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) + +#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) /* * Memory for allocating for handle keeps object position by -- cgit v1.2.3-55-g7522 From eedb92abb9bb03ef21442614a6f5867eaac6e77f Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 14:16:50 +0300 Subject: x86/mm: Make virtual memory layout dynamic for CONFIG_X86_5LEVEL=y We need to be able to adjust virtual memory layout at runtime to be able to switch between 4- and 5-level paging at boot-time. KASLR already has movable __VMALLOC_BASE, __VMEMMAP_BASE and __PAGE_OFFSET. Let's re-use it. Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214111656.88514-4-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 8 ++++++++ arch/x86/include/asm/kaslr.h | 4 ---- arch/x86/include/asm/page_64.h | 4 ++++ arch/x86/include/asm/page_64_types.h | 4 ++-- arch/x86/include/asm/pgtable_64_types.h | 4 ++-- arch/x86/kernel/head64.c | 9 +++++++++ arch/x86/mm/kaslr.c | 8 -------- 7 files changed, 25 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 63bf349b2b24..92256489b8a4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1430,6 +1430,7 @@ config X86_PAE config X86_5LEVEL bool "Enable 5-level page tables support" + select DYNAMIC_MEMORY_LAYOUT depends on X86_64 ---help--- 5-level paging enables access to larger address space: @@ -2143,10 +2144,17 @@ config PHYSICAL_ALIGN Don't change this unless you know what you are doing. +config DYNAMIC_MEMORY_LAYOUT + bool + ---help--- + This option makes base addresses of vmalloc and vmemmap as well as + __PAGE_OFFSET movable during boot. 
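In practice, "movable" means that what were build-time layout constants become ordinary variables that the early boot code fixes up, and all later address arithmetic reads them at run time. A minimal sketch of the resulting direct-map translation (illustrative only; phys_to_virt_sketch() is a made-up name, not part of this patch):

	extern unsigned long page_offset_base;	/* direct-map base, set up in early boot */

	static inline void *phys_to_virt_sketch(unsigned long paddr)
	{
		/* Add a base read from a variable at run time... */
		return (void *)(paddr + page_offset_base);
		/* ...instead of a constant baked in at build time. */
	}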
+ config RANDOMIZE_MEMORY bool "Randomize the kernel memory sections" depends on X86_64 depends on RANDOMIZE_BASE + select DYNAMIC_MEMORY_LAYOUT default RANDOMIZE_BASE ---help--- Randomizes the base virtual address of kernel memory sections diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index 460991e3b529..db7ba2feb947 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -5,10 +5,6 @@ unsigned long kaslr_get_random_long(const char *purpose); #ifdef CONFIG_RANDOMIZE_MEMORY -extern unsigned long page_offset_base; -extern unsigned long vmalloc_base; -extern unsigned long vmemmap_base; - void kernel_randomize_memory(void); #else static inline void kernel_randomize_memory(void) { } diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 4baa6bceb232..096378650142 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -11,6 +11,10 @@ extern unsigned long max_pfn; extern unsigned long phys_base; +extern unsigned long page_offset_base; +extern unsigned long vmalloc_base; +extern unsigned long vmemmap_base; + static inline unsigned long __phys_addr_nodebug(unsigned long x) { unsigned long y = x - __START_KERNEL_map; diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index f68e6526891d..d54a3d5b5b3b 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -43,11 +43,11 @@ #define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL) #endif -#ifdef CONFIG_RANDOMIZE_MEMORY +#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT #define __PAGE_OFFSET page_offset_base #else #define __PAGE_OFFSET __PAGE_OFFSET_BASE -#endif /* CONFIG_RANDOMIZE_MEMORY */ +#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 7168de7d34eb..a0db91ab63b8 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -100,13 +100,13 @@ typedef struct { pteval_t pte; } pte_t; # define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #endif -#ifdef CONFIG_RANDOMIZE_MEMORY +#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT # define VMALLOC_START vmalloc_base # define VMEMMAP_START vmemmap_base #else # define VMALLOC_START __VMALLOC_BASE # define VMEMMAP_START __VMEMMAP_BASE -#endif /* CONFIG_RANDOMIZE_MEMORY */ +#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ #define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 7ba5d819ebe3..bf5c9ba63ba1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -39,6 +39,15 @@ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); +#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT +unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE; +EXPORT_SYMBOL(page_offset_base); +unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE; +EXPORT_SYMBOL(vmalloc_base); +unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE; +EXPORT_SYMBOL(vmemmap_base); +#endif + #define __head __section(.head.text) static void __head *fixup_pointer(void *ptr, unsigned long physaddr) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index aedebd2ebf1e..515b98a8ccee 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -43,14 +43,6 @@ static const 
unsigned long vaddr_start = __PAGE_OFFSET_BASE; static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; -/* Default values */ -unsigned long page_offset_base = __PAGE_OFFSET_BASE; -EXPORT_SYMBOL(page_offset_base); -unsigned long vmalloc_base = __VMALLOC_BASE; -EXPORT_SYMBOL(vmalloc_base); -unsigned long vmemmap_base = __VMEMMAP_BASE; -EXPORT_SYMBOL(vmemmap_base); - /* * Memory regions randomized by KASLR (except modules that use a separate logic * earlier during boot). The list is ordered based on virtual addresses. This -- cgit v1.2.3-55-g7522 From e626e6bb0dfaca41487241d49ce0ae827716101a Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 14:16:51 +0300 Subject: x86/mm: Introduce 'pgtable_l5_enabled' The new flag would indicate what paging mode we are in. Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214111656.88514-5-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 4 ++++ arch/x86/include/asm/pgtable_32_types.h | 2 ++ arch/x86/include/asm/pgtable_64_types.h | 6 ++++++ arch/x86/kernel/head64.c | 5 +++++ 4 files changed, 17 insertions(+) (limited to 'arch') diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 8199a6187251..bd69e1830fbe 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -46,6 +46,10 @@ #define STATIC #include +#ifdef CONFIG_X86_5LEVEL +unsigned int pgtable_l5_enabled __ro_after_init = 1; +#endif + extern unsigned long get_cmd_line_ptr(void); /* Simplified build-specific string for starting entropy. */ diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index 0777e18a1d23..e3225e83db7d 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -15,6 +15,8 @@ # include #endif +#define pgtable_l5_enabled 0 + #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index a0db91ab63b8..5e2d724f8f47 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -20,6 +20,12 @@ typedef unsigned long pgprotval_t; typedef struct { pteval_t pte; } pte_t; +#ifdef CONFIG_X86_5LEVEL +extern unsigned int pgtable_l5_enabled; +#else +#define pgtable_l5_enabled 0 +#endif + #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD 0 diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index bf5c9ba63ba1..17d00d1886de 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -39,6 +39,11 @@ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); +#ifdef CONFIG_X86_5LEVEL +unsigned int pgtable_l5_enabled __ro_after_init = 1; +EXPORT_SYMBOL(pgtable_l5_enabled); +#endif + #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE; EXPORT_SYMBOL(page_offset_base); -- cgit v1.2.3-55-g7522 From 5c7919bb1994f8dc7fed219a5db09e6bb9d473a5 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 14:16:52 +0300 Subject: x86/mm: Make LDT_BASE_ADDR dynamic LDT_BASE_ADDR has different value in 4- and 5-level paging configurations. 
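(With the entries defined in the diff below, that is -3UL << 39 = 0xfffffe8000000000 under 4-level paging versus -112UL << 48 = 0xff90000000000000 under 5-level paging.)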
We need to make it dynamic in preparation for boot-time switching between paging modes. Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214111656.88514-6-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_64_types.h | 9 +++++---- arch/x86/mm/dump_pagetables.c | 5 ++++- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 5e2d724f8f47..903e4d054bcb 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -92,18 +92,19 @@ extern unsigned int pgtable_l5_enabled; */ #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#define LDT_PGD_ENTRY_L4 -3UL +#define LDT_PGD_ENTRY_L5 -112UL +#define LDT_PGD_ENTRY (pgtable_l5_enabled ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4) +#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) + #ifdef CONFIG_X86_5LEVEL # define VMALLOC_SIZE_TB _AC(12800, UL) # define __VMALLOC_BASE _AC(0xffa0000000000000, UL) # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) -# define LDT_PGD_ENTRY _AC(-112, UL) -# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #else # define VMALLOC_SIZE_TB _AC(32, UL) # define __VMALLOC_BASE _AC(0xffffc90000000000, UL) # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) -# define LDT_PGD_ENTRY _AC(-3, UL) -# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #endif #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 2a4849e92831..a89f2dbc3531 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -89,7 +89,7 @@ static struct addr_marker address_markers[] = { [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, #endif #ifdef CONFIG_MODIFY_LDT_SYSCALL - [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" }, + [LDT_NR] = { 0UL, "LDT remap" }, #endif [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, #ifdef CONFIG_X86_ESPFIX64 @@ -570,6 +570,9 @@ static int __init pt_dump_init(void) address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET; address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; +#ifdef CONFIG_MODIFY_LDT_SYSCALL + address_markers[LDT_NR].start_address = LDT_BASE_ADDR; +#endif #endif #ifdef CONFIG_X86_32 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; -- cgit v1.2.3-55-g7522 From c65e774fb3f6af212641538694b9778ff9ab4300 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 14:16:53 +0300 Subject: x86/mm: Make PGDIR_SHIFT and PTRS_PER_P4D variable For boot-time switching between 4- and 5-level paging we need to be able to fold p4d page table level at runtime. It requires variable PGDIR_SHIFT and PTRS_PER_P4D. The change doesn't affect the kernel image size much: text data bss dec hex filename 8628091 4734304 1368064 14730459 e0c4db vmlinux.before 8628393 4734340 1368064 14730797 e0c62d vmlinux.after Signed-off-by: Kirill A. 
Shutemov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214111656.88514-7-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 2 ++ arch/x86/include/asm/pgtable_32.h | 2 ++ arch/x86/include/asm/pgtable_64_types.h | 19 ++++++++++++------- arch/x86/kernel/cpu/mcheck/mce.c | 18 ++++++------------ arch/x86/kernel/head64.c | 6 +++++- arch/x86/mm/dump_pagetables.c | 12 +++++------- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/kasan_init_64.c | 2 +- arch/x86/platform/efi/efi_64.c | 4 ++-- include/asm-generic/5level-fixup.h | 1 + include/asm-generic/pgtable-nop4d.h | 9 +++++---- include/linux/kasan.h | 2 +- mm/kasan/kasan_init.c | 2 +- 13 files changed, 44 insertions(+), 37 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index bd69e1830fbe..b18e8f9512de 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -48,6 +48,8 @@ #ifdef CONFIG_X86_5LEVEL unsigned int pgtable_l5_enabled __ro_after_init = 1; +unsigned int pgdir_shift __ro_after_init = 48; +unsigned int ptrs_per_p4d __ro_after_init = 512; #endif extern unsigned long get_cmd_line_ptr(void); diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index e67c0620aec2..d829360e26bd 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -33,6 +33,8 @@ static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } void paging_init(void); +static inline int pgd_large(pgd_t pgd) { return 0; } + /* * Define this if things work differently on an i386 and an i486: * it will (on an i486) warn about kernel memory accesses that are diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 903e4d054bcb..0c48d80e11d4 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -26,6 +26,9 @@ extern unsigned int pgtable_l5_enabled; #define pgtable_l5_enabled 0 #endif +extern unsigned int pgdir_shift; +extern unsigned int ptrs_per_p4d; + #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD 0 @@ -35,16 +38,17 @@ extern unsigned int pgtable_l5_enabled; /* * PGDIR_SHIFT determines what a top-level page table entry can map */ -#define PGDIR_SHIFT 48 +#define PGDIR_SHIFT pgdir_shift #define PTRS_PER_PGD 512 /* * 4th level page in 5-level paging case */ -#define P4D_SHIFT 39 -#define PTRS_PER_P4D 512 -#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) -#define P4D_MASK (~(P4D_SIZE - 1)) +#define P4D_SHIFT 39 +#define MAX_PTRS_PER_P4D 512 +#define PTRS_PER_P4D ptrs_per_p4d +#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) +#define P4D_MASK (~(P4D_SIZE - 1)) #define MAX_POSSIBLE_PHYSMEM_BITS 52 @@ -53,8 +57,9 @@ extern unsigned int pgtable_l5_enabled; /* * PGDIR_SHIFT determines what a top-level page table entry can map */ -#define PGDIR_SHIFT 39 -#define PTRS_PER_PGD 512 +#define PGDIR_SHIFT 39 +#define PTRS_PER_PGD 512 +#define MAX_PTRS_PER_P4D 1 #endif /* CONFIG_X86_5LEVEL */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3a8e88a611eb..cbb3af721291 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1082,19 +1082,7 @@ void arch_unmap_kpfn(unsigned long pfn) * a legal address. */ -/* - * Build time check to see if we have a spare virtual bit. 
Don't want - * to leave this until run time because most developers don't have a - * system that can exercise this code path. This will only become a - * problem if/when we move beyond 5-level page tables. - * - * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD) - */ -#if PGDIR_SHIFT + 9 < 63 decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); -#else -#error "no unused virtual bit available" -#endif if (set_memory_np(decoy_addr, 1)) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); @@ -2328,6 +2316,12 @@ static __init int mcheck_init_device(void) { int err; + /* + * Check if we have a spare virtual bit. This will only become + * a problem if/when we move beyond 5-level page tables. + */ + MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63); + if (!mce_available(&boot_cpu_data)) { err = -EIO; goto err_out; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 17d00d1886de..98b0ff49b220 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -42,6 +42,10 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); #ifdef CONFIG_X86_5LEVEL unsigned int pgtable_l5_enabled __ro_after_init = 1; EXPORT_SYMBOL(pgtable_l5_enabled); +unsigned int pgdir_shift __ro_after_init = 48; +EXPORT_SYMBOL(pgdir_shift); +unsigned int ptrs_per_p4d __ro_after_init = 512; +EXPORT_SYMBOL(ptrs_per_p4d); #endif #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT @@ -336,7 +340,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0); BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a89f2dbc3531..420058b05d39 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -428,14 +428,15 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, #define p4d_none(a) pud_none(__pud(p4d_val(a))) #endif -#if PTRS_PER_P4D > 1 - static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) { int i; p4d_t *start, *p4d_start; pgprotval_t prot; + if (PTRS_PER_P4D == 1) + return walk_pud_level(m, st, __p4d(pgd_val(addr)), P); + p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_P4D; i++) { @@ -455,11 +456,8 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, } } -#else -#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p) -#define pgd_large(a) p4d_large(__p4d(pgd_val(a))) -#define pgd_none(a) p4d_none(__p4d(pgd_val(a))) -#endif +#define pgd_large(a) (pgtable_l5_enabled ? pgd_large(a) : p4d_large(__p4d(pgd_val(a)))) +#define pgd_none(a) (pgtable_l5_enabled ? pgd_none(a) : p4d_none(__p4d(pgd_val(a)))) static inline bool is_hypervisor_range(int idx) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ab42c852069..6a4b20bc7527 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -143,7 +143,7 @@ void sync_global_pgds(unsigned long start, unsigned long end) * With folded p4d, pgd_none() is always false, we need to * handle synchonization on p4d level. 
*/ - BUILD_BUG_ON(pgd_none(*pgd_ref)); + MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref)); p4d_ref = p4d_offset(pgd_ref, addr); if (p4d_none(*p4d_ref)) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index af6f2f9c6a26..12ec90f62457 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -19,7 +19,7 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; -static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); +static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); static __init void *early_alloc(size_t size, int nid, bool panic) { diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 780460aa5ea5..d52aaa7dc088 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -257,8 +257,8 @@ void efi_sync_low_kernel_mappings(void) * only span a single PGD entry and that the entry also maps * other important kernel regions. */ - BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); - BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != + MAYBE_BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); + MAYBE_BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != (EFI_VA_END & PGDIR_MASK)); pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET); diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h index dfbd9d990637..9c2e0708eb82 100644 --- a/include/asm-generic/5level-fixup.h +++ b/include/asm-generic/5level-fixup.h @@ -8,6 +8,7 @@ #define P4D_SHIFT PGDIR_SHIFT #define P4D_SIZE PGDIR_SIZE #define P4D_MASK PGDIR_MASK +#define MAX_PTRS_PER_P4D 1 #define PTRS_PER_P4D 1 #define p4d_t pgd_t diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h index 8f22f55de17a..1a29b2a0282b 100644 --- a/include/asm-generic/pgtable-nop4d.h +++ b/include/asm-generic/pgtable-nop4d.h @@ -8,10 +8,11 @@ typedef struct { pgd_t pgd; } p4d_t; -#define P4D_SHIFT PGDIR_SHIFT -#define PTRS_PER_P4D 1 -#define P4D_SIZE (1UL << P4D_SHIFT) -#define P4D_MASK (~(P4D_SIZE-1)) +#define P4D_SHIFT PGDIR_SHIFT +#define MAX_PTRS_PER_P4D 1 +#define PTRS_PER_P4D 1 +#define P4D_SIZE (1UL << P4D_SHIFT) +#define P4D_MASK (~(P4D_SIZE-1)) /* * The "pgd_xxx()" functions here are trivial for a folded two-level diff --git a/include/linux/kasan.h b/include/linux/kasan.h index adc13474a53b..d6459bd1376d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -18,7 +18,7 @@ extern unsigned char kasan_zero_page[PAGE_SIZE]; extern pte_t kasan_zero_pte[PTRS_PER_PTE]; extern pmd_t kasan_zero_pmd[PTRS_PER_PMD]; extern pud_t kasan_zero_pud[PTRS_PER_PUD]; -extern p4d_t kasan_zero_p4d[PTRS_PER_P4D]; +extern p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D]; void kasan_populate_zero_shadow(const void *shadow_start, const void *shadow_end); diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index 554e4c0f23a2..f436246ccc79 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -31,7 +31,7 @@ unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; #if CONFIG_PGTABLE_LEVELS > 4 -p4d_t kasan_zero_p4d[PTRS_PER_P4D] __page_aligned_bss; +p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; #endif #if CONFIG_PGTABLE_LEVELS > 3 pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; -- cgit v1.2.3-55-g7522 From 162434e7f58b21f0b6c9cc5fb02222cd7d9064cc Mon Sep 17 00:00:00 2001 From: Kirill A. 
Shutemov Date: Wed, 14 Feb 2018 14:16:54 +0300 Subject: x86/mm: Make MAX_PHYSADDR_BITS and MAX_PHYSMEM_BITS dynamic For boot-time switching between paging modes, we need to be able to adjust the size of the physical address space at runtime. As part of making the physical address space size variable, we have to make X86_5LEVEL dependent on SPARSEMEM_VMEMMAP: the !SPARSEMEM_VMEMMAP configuration doesn't build with a variable MAX_PHYSMEM_BITS. For !SPARSEMEM_VMEMMAP, SECTIONS_WIDTH depends on MAX_PHYSMEM_BITS (SECTIONS_WIDTH -> SECTIONS_SHIFT -> MAX_PHYSMEM_BITS). And SECTIONS_WIDTH is used at the preprocessor stage; it doesn't work if it's dynamic. See include/linux/page-flags-layout.h. Effect on kernel image size: text data bss dec hex filename 8628393 4734340 1368064 14730797 e0c62d vmlinux.before 8628892 4734340 1368064 14731296 e0c820 vmlinux.after Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214111656.88514-8-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/pgtable_64_types.h | 2 +- arch/x86/include/asm/sparsemem.h | 9 ++------- arch/x86/kernel/setup.c | 5 ++--- 4 files changed, 6 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 92256489b8a4..fcc3f88996b3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1431,6 +1431,7 @@ config X86_PAE config X86_5LEVEL bool "Enable 5-level page tables support" select DYNAMIC_MEMORY_LAYOUT + select SPARSEMEM_VMEMMAP depends on X86_64 ---help--- 5-level paging enables access to larger address space: diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 0c48d80e11d4..59d971c85de5 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -95,7 +95,7 @@ extern unsigned int ptrs_per_p4d; * range must not overlap with anything except the KASAN shadow area, which * is correct as KASAN disables KASLR. */ -#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#define MAXMEM (1UL << MAX_PHYSMEM_BITS) #define LDT_PGD_ENTRY_L4 -3UL #define LDT_PGD_ENTRY_L5 -112UL #define LDT_PGD_ENTRY (pgtable_l5_enabled ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4) #define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 4fc1e9d3c43e..4617a2bf123c 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -27,13 +27,8 @@ # endif #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ -# ifdef CONFIG_X86_5LEVEL -# define MAX_PHYSADDR_BITS 52 -# define MAX_PHYSMEM_BITS 52 -# else -# define MAX_PHYSADDR_BITS 44 -# define MAX_PHYSMEM_BITS 46 -# endif +# define MAX_PHYSADDR_BITS (pgtable_l5_enabled ? 52 : 44) +# define MAX_PHYSMEM_BITS (pgtable_l5_enabled ?
52 : 46) #endif #endif /* CONFIG_SPARSEMEM */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1ae67e982af7..399d0f7fa8f1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -189,9 +189,7 @@ struct ist_info ist_info; #endif #else -struct cpuinfo_x86 boot_cpu_data __read_mostly = { - .x86_phys_bits = MAX_PHYSMEM_BITS, -}; +struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); #endif @@ -851,6 +849,7 @@ void __init setup_arch(char **cmdline_p) __flush_tlb_all(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); + boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS; #endif /* -- cgit v1.2.3-55-g7522 From 09e61a779e7f171c50325e6d7108a593afb2e5d4 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 14:16:55 +0300 Subject: x86/mm: Make __VIRTUAL_MASK_SHIFT dynamic For boot-time switching between paging modes, we need to be able to adjust virtual mask shifts. The change doesn't affect the kernel image size much: text data bss dec hex filename 8628892 4734340 1368064 14731296 e0c820 vmlinux.before 8628966 4734340 1368064 14731370 e0c86a vmlinux.after Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214111656.88514-9-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 12 ++++++++++++ arch/x86/include/asm/page_64_types.h | 2 +- arch/x86/mm/dump_pagetables.c | 12 ++++++++++-- arch/x86/mm/kaslr.c | 4 +++- 4 files changed, 26 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 30c8c5344c4a..2c06348b7807 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -274,8 +274,20 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) * Change top bits to match most significant bit (47th or 56th bit * depending on paging mode) in the address. */ +#ifdef CONFIG_X86_5LEVEL + testl $1, pgtable_l5_enabled(%rip) + jz 1f + shl $(64 - 57), %rcx + sar $(64 - 57), %rcx + jmp 2f +1: + shl $(64 - 48), %rcx + sar $(64 - 48), %rcx +2: +#else shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx +#endif /* If this changed %rcx, it was not canonical */ cmpq %rcx, %r11 diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index d54a3d5b5b3b..fa7dc7cd8c19 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -56,7 +56,7 @@ #define __PHYSICAL_MASK_SHIFT 52 #ifdef CONFIG_X86_5LEVEL -#define __VIRTUAL_MASK_SHIFT 56 +#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled ? 56 : 47) #else #define __VIRTUAL_MASK_SHIFT 47 #endif diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 420058b05d39..9efee6f464ab 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -85,8 +85,12 @@ static struct addr_marker address_markers[] = { [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, [VMEMMAP_START_NR] = { 0UL, "Vmemmap" }, #ifdef CONFIG_KASAN - [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, - [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, + /* + * These fields get initialized with the (dynamic) + * KASAN_SHADOW_{START,END} values in pt_dump_init(). 
+ */ + [KASAN_SHADOW_START_NR] = { 0UL, "KASAN shadow" }, + [KASAN_SHADOW_END_NR] = { 0UL, "KASAN shadow end" }, #endif #ifdef CONFIG_MODIFY_LDT_SYSCALL [LDT_NR] = { 0UL, "LDT remap" }, @@ -571,6 +575,10 @@ static int __init pt_dump_init(void) #ifdef CONFIG_MODIFY_LDT_SYSCALL address_markers[LDT_NR].start_address = LDT_BASE_ADDR; #endif +#ifdef CONFIG_KASAN + address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START; + address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END; +#endif #endif #ifdef CONFIG_X86_32 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 515b98a8ccee..d079878c6cbc 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -52,7 +52,7 @@ static __initdata struct kaslr_memory_region { unsigned long *base; unsigned long size_tb; } kaslr_regions[] = { - { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ }, + { &page_offset_base, 0 }, { &vmalloc_base, VMALLOC_SIZE_TB }, { &vmemmap_base, 1 }, }; @@ -93,6 +93,8 @@ void __init kernel_randomize_memory(void) if (!kaslr_memory_enabled()) return; + kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT); + /* * Update Physical memory mapping to available and * add padding if needed (especially for memory hotplug support). -- cgit v1.2.3-55-g7522 From 4c2b4058ab32581931c2caf760b689fd4b019a87 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 21:25:34 +0300 Subject: x86/mm: Initialize 'pgtable_l5_enabled' at boot-time 'pgtable_l5_enabled' indicates which paging mode we are using. We need to initialize it at boot-time according to machine capability. Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214182542.69302-2-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 8 +++++++- arch/x86/kernel/head64.c | 24 +++++++++++++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index b18e8f9512de..d02a838c0ce4 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -47,7 +47,7 @@ #include #ifdef CONFIG_X86_5LEVEL -unsigned int pgtable_l5_enabled __ro_after_init = 1; +unsigned int pgtable_l5_enabled __ro_after_init; unsigned int pgdir_shift __ro_after_init = 48; unsigned int ptrs_per_p4d __ro_after_init = 512; #endif @@ -729,6 +729,12 @@ void choose_random_location(unsigned long input, return; } +#ifdef CONFIG_X86_5LEVEL + if (__read_cr4() & X86_CR4_LA57) { + pgtable_l5_enabled = 1; + } +#endif + boot_params->hdr.loadflags |= KASLR_FLAG; /* Prepare to add new identity pagetables on demand. 
*/ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 98b0ff49b220..ffb31c50d515 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -40,7 +40,7 @@ static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); #ifdef CONFIG_X86_5LEVEL -unsigned int pgtable_l5_enabled __ro_after_init = 1; +unsigned int pgtable_l5_enabled __ro_after_init; EXPORT_SYMBOL(pgtable_l5_enabled); unsigned int pgdir_shift __ro_after_init = 48; EXPORT_SYMBOL(pgdir_shift); @@ -64,6 +64,26 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr) return ptr - (void *)_text + (void *)physaddr; } +#ifdef CONFIG_X86_5LEVEL +static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) +{ + return fixup_pointer(ptr, physaddr); +} + +static void __head check_la57_support(unsigned long physaddr) +{ + if (native_cpuid_eax(0) < 7) + return; + + if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) + return; + + *fixup_int(&pgtable_l5_enabled, physaddr) = 1; +} +#else +static void __head check_la57_support(unsigned long physaddr) {} +#endif + unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { @@ -76,6 +96,8 @@ unsigned long __head __startup_64(unsigned long physaddr, int i; unsigned int *next_pgt_ptr; + check_la57_support(physaddr); + /* Is the address too large? */ if (physaddr >> MAX_PHYSMEM_BITS) for (;;); -- cgit v1.2.3-55-g7522 From b16e770bfa5344f1cd4f7b4ecd7bbae25001e120 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 21:25:35 +0300 Subject: x86/mm: Initialize 'pgdir_shift' and 'ptrs_per_p4d' at boot-time Switching between paging modes requires the folding of the p4d page table level when we only have 4 paging levels, which means we need to adjust 'pgdir_shift' and 'ptrs_per_p4d' during early boot according to paging mode. Signed-off-by: Kirill A. 
Shutemov Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214182542.69302-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 6 ++++-- arch/x86/kernel/head64.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index d02a838c0ce4..66e42a098d70 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -48,8 +48,8 @@ #ifdef CONFIG_X86_5LEVEL unsigned int pgtable_l5_enabled __ro_after_init; -unsigned int pgdir_shift __ro_after_init = 48; -unsigned int ptrs_per_p4d __ro_after_init = 512; +unsigned int pgdir_shift __ro_after_init = 39; +unsigned int ptrs_per_p4d __ro_after_init = 1; #endif extern unsigned long get_cmd_line_ptr(void); @@ -732,6 +732,8 @@ void choose_random_location(unsigned long input, #ifdef CONFIG_X86_5LEVEL if (__read_cr4() & X86_CR4_LA57) { pgtable_l5_enabled = 1; + pgdir_shift = 48; + ptrs_per_p4d = 512; } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index ffb31c50d515..8a0a485524da 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -42,9 +42,9 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); #ifdef CONFIG_X86_5LEVEL unsigned int pgtable_l5_enabled __ro_after_init; EXPORT_SYMBOL(pgtable_l5_enabled); -unsigned int pgdir_shift __ro_after_init = 48; +unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); -unsigned int ptrs_per_p4d __ro_after_init = 512; +unsigned int ptrs_per_p4d __ro_after_init = 1; EXPORT_SYMBOL(ptrs_per_p4d); #endif @@ -79,6 +79,8 @@ static void __head check_la57_support(unsigned long physaddr) return; *fixup_int(&pgtable_l5_enabled, physaddr) = 1; + *fixup_int(&pgdir_shift, physaddr) = 48; + *fixup_int(&ptrs_per_p4d, physaddr) = 512; } #else static void __head check_la57_support(unsigned long physaddr) {} -- cgit v1.2.3-55-g7522 From 4fa5662b6b49611f11856db8be346710217473ef Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 21:25:36 +0300 Subject: x86/mm: Initialize 'page_offset_base' at boot-time For 4- and 5-level paging we have a different 'page_offset_base'. Let's initialize it at boot-time according to machine capability. We also have to split __PAGE_OFFSET_BASE into two constants -- for 4- and 5-level paging. Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214182542.69302-4-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page_64_types.h | 9 +++------ arch/x86/kernel/head64.c | 13 +++++++++---- arch/x86/kernel/head_64.S | 2 +- arch/x86/mm/kaslr.c | 8 ++++---- 4 files changed, 17 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index fa7dc7cd8c19..2c5a966dc222 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -37,16 +37,13 @@ * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's * what Xen requires.
*/ -#ifdef CONFIG_X86_5LEVEL -#define __PAGE_OFFSET_BASE _AC(0xff10000000000000, UL) -#else -#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL) -#endif +#define __PAGE_OFFSET_BASE_L5 _AC(0xff10000000000000, UL) +#define __PAGE_OFFSET_BASE_L4 _AC(0xffff880000000000, UL) #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT #define __PAGE_OFFSET page_offset_base #else -#define __PAGE_OFFSET __PAGE_OFFSET_BASE +#define __PAGE_OFFSET __PAGE_OFFSET_BASE_L4 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8a0a485524da..876d3bf2b23a 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -49,7 +49,7 @@ EXPORT_SYMBOL(ptrs_per_p4d); #endif #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT -unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE; +unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; EXPORT_SYMBOL(page_offset_base); unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE; EXPORT_SYMBOL(vmalloc_base); @@ -64,6 +64,11 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr) return ptr - (void *)_text + (void *)physaddr; } +static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr) +{ + return fixup_pointer(ptr, physaddr); +} + #ifdef CONFIG_X86_5LEVEL static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) { @@ -81,6 +86,7 @@ static void __head check_la57_support(unsigned long physaddr) *fixup_int(&pgtable_l5_enabled, physaddr) = 1; *fixup_int(&pgdir_shift, physaddr) = 48; *fixup_int(&ptrs_per_p4d, physaddr) = 512; + *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5; } #else static void __head check_la57_support(unsigned long physaddr) {} @@ -89,7 +95,7 @@ static void __head check_la57_support(unsigned long physaddr) {} unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { - unsigned long load_delta, *p; + unsigned long load_delta; unsigned long pgtable_flags; pgdval_t *pgd; p4dval_t *p4d; @@ -196,8 +202,7 @@ unsigned long __head __startup_64(unsigned long physaddr, * Fixup phys_base - remove the memory encryption mask to obtain * the true physical address. */ - p = fixup_pointer(&phys_base, physaddr); - *p += load_delta - sme_get_me_mask(); + *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask(); /* Encrypt the kernel and related (if SME is active) */ sme_encrypt_kernel(bp); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 04a625f0fcda..d3f8b43d541a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -41,7 +41,7 @@ #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) -PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) +PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE_L4) PGD_START_KERNEL = pgd_index(__START_KERNEL_map) #endif L3_START_KERNEL = pud_index(__START_KERNEL_map) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index d079878c6cbc..7828a7ca3bba 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -34,13 +34,10 @@ #define TB_SHIFT 40 /* - * Virtual address start and end range for randomization. - * * The end address could depend on more configuration options to make the * highest amount of space for randomization available, but that's too hard * to keep straight and caused issues already. 
*/ -static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; /* @@ -76,11 +73,14 @@ static inline bool kaslr_memory_enabled(void) void __init kernel_randomize_memory(void) { size_t i; - unsigned long vaddr = vaddr_start; + unsigned long vaddr_start, vaddr; unsigned long rand, memory_tb; struct rnd_state rand_state; unsigned long remain_entropy; + vaddr_start = pgtable_l5_enabled ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4; + vaddr = vaddr_start; + /* * These BUILD_BUG_ON checks ensure the memory layout is consistent * with the vaddr_start/vaddr_end variables. These checks are very -- cgit v1.2.3-55-g7522 From a7412546d8cb5ad578805060b4006f2a021b5868 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 21:25:37 +0300 Subject: x86/mm: Adjust vmalloc base and size at boot-time vmalloc area has different placement and size depending on paging mode. Let's adjust it during early boot accodring to machine capability. Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214182542.69302-5-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_64_types.h | 16 ++++++++++------ arch/x86/kernel/head64.c | 3 ++- arch/x86/mm/kaslr.c | 3 ++- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 59d971c85de5..686329994ade 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -102,25 +102,29 @@ extern unsigned int ptrs_per_p4d; #define LDT_PGD_ENTRY (pgtable_l5_enabled ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4) #define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) +#define __VMALLOC_BASE_L4 0xffffc90000000000 +#define __VMALLOC_BASE_L5 0xffa0000000000000 + +#define VMALLOC_SIZE_TB_L4 32UL +#define VMALLOC_SIZE_TB_L5 12800UL + #ifdef CONFIG_X86_5LEVEL -# define VMALLOC_SIZE_TB _AC(12800, UL) -# define __VMALLOC_BASE _AC(0xffa0000000000000, UL) # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) #else -# define VMALLOC_SIZE_TB _AC(32, UL) -# define __VMALLOC_BASE _AC(0xffffc90000000000, UL) # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) #endif #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT # define VMALLOC_START vmalloc_base +# define VMALLOC_SIZE_TB (pgtable_l5_enabled ? 
From 9b46a051e43461a9afda2bdd50e0e0ae349341df Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 21:25:38 +0300 Subject: x86/mm: Initialize vmemmap_base at boot-time vmemmap area has different placement depending on paging mode. Let's adjust it during early boot according to machine capability. Signed-off-by: Kirill A.
Shutemov Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214182542.69302-6-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_64_types.h | 9 +++------ arch/x86/kernel/head64.c | 3 ++- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 686329994ade..68909a68e5b9 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -108,11 +108,8 @@ extern unsigned int ptrs_per_p4d; #define VMALLOC_SIZE_TB_L4 32UL #define VMALLOC_SIZE_TB_L5 12800UL -#ifdef CONFIG_X86_5LEVEL -# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) -#else -# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) -#endif +#define __VMEMMAP_BASE_L4 0xffffea0000000000 +#define __VMEMMAP_BASE_L5 0xffd4000000000000 #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT # define VMALLOC_START vmalloc_base @@ -121,7 +118,7 @@ extern unsigned int ptrs_per_p4d; #else # define VMALLOC_START __VMALLOC_BASE_L4 # define VMALLOC_SIZE_TB VMALLOC_SIZE_TB_L4 -# define VMEMMAP_START __VMEMMAP_BASE +# define VMEMMAP_START __VMEMMAP_BASE_L4 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ #define VMALLOC_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 22bf2015254c..795e762f3c66 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -53,7 +53,7 @@ unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; EXPORT_SYMBOL(page_offset_base); unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4; EXPORT_SYMBOL(vmalloc_base); -unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE; +unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; EXPORT_SYMBOL(vmemmap_base); #endif @@ -88,6 +88,7 @@ static void __head check_la57_support(unsigned long physaddr) *fixup_int(&ptrs_per_p4d, physaddr) = 512; *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5; *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5; + *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5; } #else static void __head check_la57_support(unsigned long physaddr) {} -- cgit v1.2.3-55-g7522 From 6f9dd329717f696f578347c0781a0247db957596 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 21:25:39 +0300 Subject: x86/mm: Support boot-time switching of paging modes in the early boot code Early boot code should be able to initialize page tables for both 4- and 5-level paging modes. Signed-off-by: Kirill A. 
Shutemov Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214182542.69302-7-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 33 ++++++++++++++++++++++----------- arch/x86/kernel/head_64.S | 10 ++++------ 2 files changed, 26 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 795e762f3c66..8161e719a20f 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -75,13 +75,13 @@ static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) return fixup_pointer(ptr, physaddr); } -static void __head check_la57_support(unsigned long physaddr) +static bool __head check_la57_support(unsigned long physaddr) { if (native_cpuid_eax(0) < 7) - return; + return false; if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) - return; + return false; *fixup_int(&pgtable_l5_enabled, physaddr) = 1; *fixup_int(&pgdir_shift, physaddr) = 48; @@ -89,24 +89,30 @@ static void __head check_la57_support(unsigned long physaddr) *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5; *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5; *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5; + + return true; } #else -static void __head check_la57_support(unsigned long physaddr) {} +static bool __head check_la57_support(unsigned long physaddr) +{ + return false; +} #endif unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { - unsigned long load_delta; + unsigned long load_delta, *p; unsigned long pgtable_flags; pgdval_t *pgd; p4dval_t *p4d; pudval_t *pud; pmdval_t *pmd, pmd_entry; + bool la57; int i; unsigned int *next_pgt_ptr; - check_la57_support(physaddr); + la57 = check_la57_support(physaddr); /* Is the address too large? */ if (physaddr >> MAX_PHYSMEM_BITS) @@ -131,9 +137,14 @@ unsigned long __head __startup_64(unsigned long physaddr, /* Fixup the physical addresses in the page table */ pgd = fixup_pointer(&early_top_pgt, physaddr); - pgd[pgd_index(__START_KERNEL_map)] += load_delta; - - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p = pgd + pgd_index(__START_KERNEL_map); + if (la57) + *p = (unsigned long)level4_kernel_pgt; + else + *p = (unsigned long)level3_kernel_pgt; + *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta; + + if (la57) { p4d = fixup_pointer(&level4_kernel_pgt, physaddr); p4d[511] += load_delta; } @@ -158,7 +169,7 @@ unsigned long __head __startup_64(unsigned long physaddr, pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + if (la57) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; @@ -255,7 +266,7 @@ again: * critical -- __PAGE_OFFSET would point us back into the dynamic * range and we might end up looping forever... 
*/ - if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + if (!pgtable_l5_enabled) p4d_p = pgd_p; else if (pgd) p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d3f8b43d541a..145d7b95ae29 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -124,7 +124,10 @@ ENTRY(secondary_startup_64) /* Enable PAE mode, PGE and LA57 */ movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx #ifdef CONFIG_X86_5LEVEL + testl $1, pgtable_l5_enabled(%rip) + jz 1f orl $X86_CR4_LA57, %ecx +1: #endif movq %rcx, %cr4 @@ -372,12 +375,7 @@ GLOBAL(name) __INITDATA NEXT_PGD_PAGE(early_top_pgt) - .fill 511,8,0 -#ifdef CONFIG_X86_5LEVEL - .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC -#else - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC -#endif + .fill 512,8,0 .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(early_dynamic_pgts) -- cgit v1.2.3-55-g7522 From 98219dda2ab56ce2a967fdebf81e838d676d9ddc Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 21:25:40 +0300 Subject: x86/mm: Fold p4d page table layer at runtime Change page table helpers to fold p4d at runtime. The logic is the same as in <asm-generic/pgtable-nop4d.h>. Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214182542.69302-8-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/paravirt.h | 10 ++++++---- arch/x86/include/asm/pgalloc.h | 5 ++++- arch/x86/include/asm/pgtable.h | 11 ++++++++++- 3 files changed, 20 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 892df375b615..3fbaad238a94 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -569,14 +569,16 @@ static inline p4dval_t p4d_val(p4d_t p4d) static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) { - pgdval_t val = native_pgd_val(pgd); - - PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val); + if (pgtable_l5_enabled) + PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd)); + else + set_p4d((p4d_t *)(pgdp), (p4d_t) { pgd.pgd }); } static inline void pgd_clear(pgd_t *pgdp) { - set_pgd(pgdp, __pgd(0)); + if (pgtable_l5_enabled) + set_pgd(pgdp, __pgd(0)); } #endif /* CONFIG_PGTABLE_LEVELS == 5 */ diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index aff42e1da6ee..263c142a6a6c 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -167,6 +167,8 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, #if CONFIG_PGTABLE_LEVELS > 4 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) { + if (!pgtable_l5_enabled) + return; paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); } @@ -191,7 +193,8 @@ extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d); static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, unsigned long address) { - ___p4d_free_tlb(tlb, p4d); + if (pgtable_l5_enabled) + ___p4d_free_tlb(tlb, p4d); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 63c2552b6b65..c8baa7f12d1b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -65,7 +65,7 @@ extern pmdval_t early_pmd_flags; #ifndef __PAGETABLE_P4D_FOLDED #define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) -#define pgd_clear(pgd) native_pgd_clear(pgd) +#define pgd_clear(pgd) (pgtable_l5_enabled ? native_pgd_clear(pgd) : 0) #endif #ifndef set_p4d @@ -859,6 +859,8 @@ static inline unsigned long p4d_index(unsigned long address) #if CONFIG_PGTABLE_LEVELS > 4 static inline int pgd_present(pgd_t pgd) { + if (!pgtable_l5_enabled) + return 1; return pgd_flags(pgd) & _PAGE_PRESENT; } @@ -876,6 +878,8 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) /* to find an entry in a page-table-directory. */ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { + if (!pgtable_l5_enabled) + return (p4d_t *)pgd; return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address); } @@ -883,6 +887,9 @@ static inline int pgd_bad(pgd_t pgd) { unsigned long ignore_flags = _PAGE_USER; + if (!pgtable_l5_enabled) + return 0; + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; @@ -891,6 +898,8 @@ static inline int pgd_bad(pgd_t pgd) static inline int pgd_none(pgd_t pgd) { + if (!pgtable_l5_enabled) + return 0; /* * There is no need to do a workaround for the KNL stray * A/D bit erratum here. PGDs only point to page tables -- cgit v1.2.3-55-g7522
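A minimal sketch of the folding pattern the patch above applies (illustrative only; it restates the p4d_offset() hunk shown, with a made-up function name): when pgtable_l5_enabled is clear, the p4d level collapses into the pgd, so the helper returns the pgd slot itself instead of walking into a separate p4d page:

static inline p4d_t *example_p4d_offset(pgd_t *pgd, unsigned long address)
{
	/* 4-level mode: the pgd entry doubles as the p4d entry */
	if (!pgtable_l5_enabled)
		return (p4d_t *)pgd;

	/* 5-level mode: descend into the real p4d page table */
	return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
}

The same short-circuit recurs in pgd_present(), pgd_bad() and pgd_none() above, which simply report the folded level as present and sane.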
From 91f606a8fa68264224cbc76888fa8649cdbe9990 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 21:25:41 +0300 Subject: x86/mm: Replace compile-time checks for 5-level paging with runtime checks This patch converts the rest of the CONFIG_X86_5LEVEL checks to runtime checks for p4d folding. Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214182542.69302-9-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_64.h | 23 ++++++++++------------- arch/x86/mm/dump_pagetables.c | 4 +--- arch/x86/mm/fault.c | 4 ++-- arch/x86/mm/ident_map.c | 2 +- arch/x86/mm/init_64.c | 30 ++++++++++++++++++------------ arch/x86/mm/kasan_init_64.c | 12 ++++++------ arch/x86/mm/kaslr.c | 6 +++--- arch/x86/mm/tlb.c | 2 +- arch/x86/platform/efi/efi_64.c | 2 +- arch/x86/power/hibernate_64.c | 6 +++--- 10 files changed, 46 insertions(+), 45 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 81462e9a34f6..81dda8d1d0bd 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -217,29 +217,26 @@ static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { -#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL) - p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd); -#else - *p4dp = p4d; -#endif + pgd_t pgd; + + if (pgtable_l5_enabled || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) { + *p4dp = p4d; + return; + } + + pgd = native_make_pgd(p4d_val(p4d)); + pgd = pti_set_user_pgd((pgd_t *)p4dp, pgd); + *p4dp = native_make_p4d(pgd_val(pgd)); } static inline void native_p4d_clear(p4d_t *p4d) { -#ifdef CONFIG_X86_5LEVEL native_set_p4d(p4d, native_make_p4d(0)); -#else - native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)}); -#endif } static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION *pgdp = pti_set_user_pgd(pgdp, pgd); -#else - *pgdp = pgd; -#endif } static inline void
native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 9efee6f464ab..0d6d67d18ad6 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -348,9 +348,7 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, void *pt) { if (__pa(pt) == __pa(kasan_zero_pmd) || -#ifdef CONFIG_X86_5LEVEL - __pa(pt) == __pa(kasan_zero_p4d) || -#endif + (pgtable_l5_enabled && __pa(pt) == __pa(kasan_zero_p4d)) || __pa(pt) == __pa(kasan_zero_pud)) { pgprotval_t prot = pte_flags(kasan_zero_pte[0]); note_page(m, st, __pgprot(prot), 5); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 800de815519c..321b78060e93 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -439,7 +439,7 @@ static noinline int vmalloc_fault(unsigned long address) if (pgd_none(*pgd_ref)) return -1; - if (CONFIG_PGTABLE_LEVELS > 4) { + if (pgtable_l5_enabled) { if (pgd_none(*pgd)) { set_pgd(pgd, *pgd_ref); arch_flush_lazy_mmu_mode(); @@ -454,7 +454,7 @@ static noinline int vmalloc_fault(unsigned long address) if (p4d_none(*p4d_ref)) return -1; - if (p4d_none(*p4d) && CONFIG_PGTABLE_LEVELS == 4) { + if (p4d_none(*p4d) && !pgtable_l5_enabled) { set_p4d(p4d, *p4d_ref); arch_flush_lazy_mmu_mode(); } else { diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index ab33a32df2a8..9aa22be8331e 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -120,7 +120,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, result = ident_p4d_init(info, p4d, addr, next); if (result) return result; - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + if (pgtable_l5_enabled) { set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); } else { /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6a4b20bc7527..3186e6836036 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -88,12 +88,7 @@ static int __init nonx32_setup(char *str) } __setup("noexec32=", nonx32_setup); -/* - * When memory was added make sure all the processes MM have - * suitable PGD entries in the local PGD level page. - */ -#ifdef CONFIG_X86_5LEVEL -void sync_global_pgds(unsigned long start, unsigned long end) +static void sync_global_pgds_l5(unsigned long start, unsigned long end) { unsigned long addr; @@ -129,8 +124,8 @@ void sync_global_pgds(unsigned long start, unsigned long end) spin_unlock(&pgd_lock); } } -#else -void sync_global_pgds(unsigned long start, unsigned long end) + +static void sync_global_pgds_l4(unsigned long start, unsigned long end) { unsigned long addr; @@ -173,7 +168,18 @@ void sync_global_pgds(unsigned long start, unsigned long end) spin_unlock(&pgd_lock); } } -#endif + +/* + * When memory was added make sure all the processes MM have + * suitable PGD entries in the local PGD level page. 
+ */ +void sync_global_pgds(unsigned long start, unsigned long end) +{ + if (pgtable_l5_enabled) + sync_global_pgds_l5(start, end); + else + sync_global_pgds_l4(start, end); +} /* * NOTE: This function is marked __ref because it calls __init function @@ -632,7 +638,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, unsigned long vaddr = (unsigned long)__va(paddr); int i = p4d_index(vaddr); - if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + if (!pgtable_l5_enabled) return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask); for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) { @@ -712,7 +718,7 @@ kernel_physical_mapping_init(unsigned long paddr_start, page_size_mask); spin_lock(&init_mm.page_table_lock); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) + if (pgtable_l5_enabled) pgd_populate(&init_mm, pgd, p4d); else p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); @@ -1093,7 +1099,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, * 5-level case we should free them. This code will have to change * to adapt for boot-time switching between 4 and 5 level page tables. */ - if (CONFIG_PGTABLE_LEVELS == 5) + if (pgtable_l5_enabled) free_pud_table(pud_base, p4d, altmap); } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 12ec90f62457..0df0dd13a71d 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -176,10 +176,10 @@ static void __init clear_pgds(unsigned long start, * With folded p4d, pgd_clear() is nop, use p4d_clear() * instead. */ - if (CONFIG_PGTABLE_LEVELS < 5) - p4d_clear(p4d_offset(pgd, start)); - else + if (pgtable_l5_enabled) pgd_clear(pgd); + else + p4d_clear(p4d_offset(pgd, start)); } pgd = pgd_offset_k(start); @@ -191,7 +191,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr) { unsigned long p4d; - if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + if (!pgtable_l5_enabled) return (p4d_t *)pgd; p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK; @@ -272,7 +272,7 @@ void __init kasan_early_init(void) for (i = 0; i < PTRS_PER_PUD; i++) kasan_zero_pud[i] = __pud(pud_val); - for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++) + for (i = 0; pgtable_l5_enabled && i < PTRS_PER_P4D; i++) kasan_zero_p4d[i] = __p4d(p4d_val); kasan_map_early_shadow(early_top_pgt); @@ -303,7 +303,7 @@ void __init kasan_init(void) * bunch of things like kernel code, modules, EFI mapping, etc. * We need to take extra steps to not overwrite them. */ - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + if (pgtable_l5_enabled) { void *ptr; ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 641169d38184..615cc03ced84 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -124,7 +124,7 @@ void __init kernel_randomize_memory(void) */ entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); prandom_bytes_state(&rand_state, &rand, sizeof(rand)); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) + if (pgtable_l5_enabled) entropy = (rand % (entropy + 1)) & P4D_MASK; else entropy = (rand % (entropy + 1)) & PUD_MASK; @@ -136,7 +136,7 @@ void __init kernel_randomize_memory(void) * randomization alignment. 
*/ vaddr += get_padding(&kaslr_regions[i]); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) + if (pgtable_l5_enabled) vaddr = round_up(vaddr + 1, P4D_SIZE); else vaddr = round_up(vaddr + 1, PUD_SIZE); @@ -212,7 +212,7 @@ void __meminit init_trampoline(void) return; } - if (IS_ENABLED(CONFIG_X86_5LEVEL)) + if (pgtable_l5_enabled) init_trampoline_p4d(); else init_trampoline_pud(); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6550d37d0f06..92cb8a901c36 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -157,7 +157,7 @@ static void sync_current_stack_to_mm(struct mm_struct *mm) unsigned long sp = current_stack_pointer; pgd_t *pgd = pgd_offset(mm, sp); - if (CONFIG_PGTABLE_LEVELS > 4) { + if (pgtable_l5_enabled) { if (unlikely(pgd_none(*pgd))) { pgd_t *pgd_ref = pgd_offset_k(sp); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index d52aaa7dc088..4845871a2006 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -227,7 +227,7 @@ int __init efi_alloc_page_tables(void) pud = pud_alloc(&init_mm, p4d, EFI_VA_END); if (!pud) { - if (CONFIG_PGTABLE_LEVELS > 4) + if (pgtable_l5_enabled) free_page((unsigned long) pgd_page_vaddr(*pgd)); free_page((unsigned long)efi_pgd); return -ENOMEM; } diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 0ef5e5204968..74a532989308 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -50,7 +50,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) { pmd_t *pmd; pud_t *pud; - p4d_t *p4d; + p4d_t *p4d = NULL; /* * The new mapping only has to cover the page containing the image @@ -66,7 +66,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) * tables used by the image kernel. */ - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + if (pgtable_l5_enabled) { p4d = (p4d_t *)get_safe_page(GFP_ATOMIC); if (!p4d) return -ENOMEM; @@ -84,7 +84,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); set_pud(pud + pud_index(restore_jump_address), __pud(__pa(pmd) | _KERNPG_TABLE)); - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + if (p4d) { set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE)); set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE)); } else { -- cgit v1.2.3-55-g7522 From 6657fca06e3ffab8d0b3f9d8b397f5ee498952d7 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Wed, 14 Feb 2018 21:25:42 +0300 Subject: x86/mm: Allow to boot without LA57 if CONFIG_X86_5LEVEL=y All pieces of the puzzle are in place and we can now allow booting with CONFIG_X86_5LEVEL=y on a machine without LA57 support. The kernel will detect that LA57 is missing and fold p4d at runtime. Update the documentation and the Kconfig option description to reflect the change. Signed-off-by: Kirill A.
Shutemov Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214182542.69302-10-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/5level-paging.txt | 9 +++------ arch/x86/Kconfig | 4 ++-- arch/x86/boot/compressed/misc.c | 16 ---------------- arch/x86/include/asm/required-features.h | 8 +------- 4 files changed, 6 insertions(+), 31 deletions(-) (limited to 'arch') diff --git a/Documentation/x86/x86_64/5level-paging.txt b/Documentation/x86/x86_64/5level-paging.txt index 087251a0d99c..2432a5ef86d9 100644 --- a/Documentation/x86/x86_64/5level-paging.txt +++ b/Documentation/x86/x86_64/5level-paging.txt @@ -20,12 +20,9 @@ Documentation/x86/x86_64/mm.txt CONFIG_X86_5LEVEL=y enables the feature. -So far, a kernel compiled with the option enabled will be able to boot -only on machines that supports the feature -- see for 'la57' flag in -/proc/cpuinfo. - -The plan is to implement boot-time switching between 4- and 5-level paging -in the future. +Kernel with CONFIG_X86_5LEVEL=y still able to boot on 4-level hardware. +In this case additional page table level -- p4d -- will be folded at +runtime. == User-space and large virtual address space == diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fcc3f88996b3..1c4f7b6a94f4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1440,8 +1440,8 @@ config X86_5LEVEL It will be supported by future Intel CPUs. - Note: a kernel with this option enabled can only be booted - on machines that support the feature. + A kernel with the option enabled can be booted on machines that + support 4- or 5-level paging. See Documentation/x86/x86_64/5level-paging.txt for more information. diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 98761a1576ce..b50c42455e25 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -169,16 +169,6 @@ void __puthex(unsigned long value) } } -static bool l5_supported(void) -{ - /* Check if leaf 7 is supported. */ - if (native_cpuid_eax(0) < 7) - return 0; - - /* Check if la57 is supported. 
*/ - return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)); -} - #if CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len, unsigned long virt_addr) @@ -372,12 +362,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, console_init(); debug_putstr("early console in extract_kernel\n"); - if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) { - error("This linux kernel as configured requires 5-level paging\n" - "This CPU does not support the required 'cr4.la57' feature\n" - "Unable to boot - please use a kernel appropriate for your CPU\n"); - } - free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index fb3a6de7440b..6847d85400a8 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -53,12 +53,6 @@ # define NEED_MOVBE 0 #endif -#ifdef CONFIG_X86_5LEVEL -# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31)) -#else -# define NEED_LA57 0 -#endif - #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT /* Paravirtualized systems may not have PSE or PGE available */ @@ -104,7 +98,7 @@ #define REQUIRED_MASK13 0 #define REQUIRED_MASK14 0 #define REQUIRED_MASK15 0 -#define REQUIRED_MASK16 (NEED_LA57) +#define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 #define REQUIRED_MASK18 0 #define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) -- cgit v1.2.3-55-g7522 From b9952ec78778aa7ae5b8df672668aece6fc93d2a Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Fri, 16 Feb 2018 14:49:46 +0300 Subject: x86/xen: Allow XEN_PV and XEN_PVH to be enabled with X86_5LEVEL With boot-time switching between paging modes, XEN_PV and XEN_PVH can be booted into 4-level paging mode. Tested-by: Juergen Gross Signed-off-by: Kirill A.
Shutemov Reviewed-by: Juergen Gross Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180216114948.68868-2-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 12 ++++++------ arch/x86/xen/Kconfig | 5 ----- arch/x86/xen/mmu_pv.c | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 145d7b95ae29..3e9de0fc97de 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -38,12 +38,12 @@ * */ +#define l4_index(x) (((x) >> 39) & 511) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) -PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE_L4) -PGD_START_KERNEL = pgd_index(__START_KERNEL_map) -#endif +L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4) +L4_START_KERNEL = l4_index(__START_KERNEL_map) + L3_START_KERNEL = pud_index(__START_KERNEL_map) .text @@ -386,9 +386,9 @@ NEXT_PAGE(early_dynamic_pgts) #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) NEXT_PGD_PAGE(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC - .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 + .org init_top_pgt + L4_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC - .org init_top_pgt + PGD_START_KERNEL*8, 0 + .org init_top_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .fill PTI_USER_PGD_FILL,8,0 diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index f605825a04ab..c1f98f32c45f 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -18,9 +18,6 @@ config XEN_PV bool "Xen PV guest support" default y depends on XEN - # XEN_PV is not ready to work with 5-level paging. - # Changes to hypervisor are also required. - depends on !X86_5LEVEL select XEN_HAVE_PVMMU select XEN_HAVE_VPMU help @@ -79,6 +76,4 @@ config XEN_DEBUG_FS config XEN_PVH bool "Support for running as a PVH guest" depends on XEN && XEN_PVHVM && ACPI - # Pre-built page tables are not ready to handle 5-level paging. 
- depends on !X86_5LEVEL def_bool n diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index d85076223a69..3f4fec59af09 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -538,6 +538,22 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val) xen_mc_issue(PARAVIRT_LAZY_MMU); } + +#if CONFIG_PGTABLE_LEVELS >= 5 +__visible p4dval_t xen_p4d_val(p4d_t p4d) +{ + return pte_mfn_to_pfn(p4d.p4d); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_p4d_val); + +__visible p4d_t xen_make_p4d(p4dval_t p4d) +{ + p4d = pte_pfn_to_mfn(p4d); + + return native_make_p4d(p4d); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d); +#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ #endif /* CONFIG_X86_64 */ static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, @@ -2411,6 +2427,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .alloc_pud = xen_alloc_pmd_init, .release_pud = xen_release_pmd_init, + +#if CONFIG_PGTABLE_LEVELS >= 5 + .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), + .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), +#endif #endif /* CONFIG_X86_64 */ .activate_mm = xen_activate_mm, -- cgit v1.2.3-55-g7522 From 92e1c5b3f7bf5407cfdbf13613e7101831216dc5 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Fri, 16 Feb 2018 14:49:47 +0300 Subject: x86/mm: Redefine some of the page table helpers as macros This is preparation for the next patch, which will change pgtable_l5_enabled to be cpu_feature_enabled(X86_FEATURE_LA57). The change makes a few helpers in paravirt.h dependent on the cpu_feature_enabled() definition from cpufeature.h, and cpufeature.h is in turn dependent on paravirt.h. Let's redefine some of the helpers as macros to break this dependency loop. Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180216114948.68868-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/paravirt.h | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 3fbaad238a94..2c0c8c9e9516 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -567,19 +567,22 @@ static inline p4dval_t p4d_val(p4d_t p4d) return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d); } -static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) +static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) { - if (pgtable_l5_enabled) - PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd)); - else - set_p4d((p4d_t *)(pgdp), (p4d_t) { pgd.pgd }); + PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd)); } -static inline void pgd_clear(pgd_t *pgdp) -{ - if (pgtable_l5_enabled) - set_pgd(pgdp, __pgd(0)); -} +#define set_pgd(pgdp, pgdval) do { \ + if (pgtable_l5_enabled) \ + __set_pgd(pgdp, pgdval); \ + else \ + set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \ +} while (0) + +#define pgd_clear(pgdp) do { \ + if (pgtable_l5_enabled) \ + set_pgd(pgdp, __pgd(0)); \ +} while (0) #endif /* CONFIG_PGTABLE_LEVELS == 5 */ -- cgit v1.2.3-55-g7522
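Roughly, the effect of the macro conversion above (a sketch under the assumption that the call site already includes cpufeature.h; example_install_pgd is a made-up name): because set_pgd() and pgd_clear() are now macros, pgtable_l5_enabled -- and, after the next patch, cpu_feature_enabled() -- is only evaluated where the macro is expanded, so paravirt.h itself never has to pull in cpufeature.h:

static void example_install_pgd(pgd_t *pgdp, pgd_t val)
{
	/*
	 * set_pgd(pgdp, val) expands here, at the call site, to:
	 *
	 *	if (pgtable_l5_enabled)
	 *		__set_pgd(pgdp, val);
	 *	else
	 *		set_p4d((p4d_t *)(pgdp), (p4d_t) { (val).pgd });
	 */
	set_pgd(pgdp, val);
}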
From 39b9552281abfcdfc54162897018890dafe7ffef Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Fri, 16 Feb 2018 14:49:48 +0300 Subject: x86/mm: Optimize boot-time paging mode switching cost By this point we have functioning boot-time switching between 4- and 5-level paging mode. But the naive approach comes with a cost. The numbers below are for a kernel build, allmodconfig, 5 runs. CONFIG_X86_5LEVEL=n: Performance counter stats for 'sh -c make -j100 -B -k >/dev/null' (5 runs): 17308719.892691 task-clock:u (msec) # 26.772 CPUs utilized ( +- 0.11% ) 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 331,993,164 page-faults:u # 0.019 M/sec ( +- 0.01% ) 43,614,978,867,455 cycles:u # 2.520 GHz ( +- 0.01% ) 39,371,534,575,126 stalled-cycles-frontend:u # 90.27% frontend cycles idle ( +- 0.09% ) 28,363,350,152,428 instructions:u # 0.65 insn per cycle # 1.39 stalled cycles per insn ( +- 0.00% ) 6,316,784,066,413 branches:u # 364.948 M/sec ( +- 0.00% ) 250,808,144,781 branch-misses:u # 3.97% of all branches ( +- 0.01% ) 646.531974142 seconds time elapsed ( +- 1.15% ) CONFIG_X86_5LEVEL=y: Performance counter stats for 'sh -c make -j100 -B -k >/dev/null' (5 runs): 17411536.780625 task-clock:u (msec) # 26.426 CPUs utilized ( +- 0.10% ) 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 331,868,663 page-faults:u # 0.019 M/sec ( +- 0.01% ) 43,865,909,056,301 cycles:u # 2.519 GHz ( +- 0.01% ) 39,740,130,365,581 stalled-cycles-frontend:u # 90.59% frontend cycles idle ( +- 0.05% ) 28,363,358,997,959 instructions:u # 0.65 insn per cycle # 1.40 stalled cycles per insn ( +- 0.00% ) 6,316,784,937,460 branches:u # 362.793 M/sec ( +- 0.00% ) 251,531,919,485 branch-misses:u # 3.98% of all branches ( +- 0.00% ) 658.886307752 seconds time elapsed ( +- 0.92% ) The patch tries to fix the performance regression by using cpu_feature_enabled(X86_FEATURE_LA57) instead of pgtable_l5_enabled in all hot code paths. These will statically patch the target code for additional performance. CONFIG_X86_5LEVEL=y + the patch: Performance counter stats for 'sh -c make -j100 -B -k >/dev/null' (5 runs): 17381990.268506 task-clock:u (msec) # 26.907 CPUs utilized ( +- 0.19% ) 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 331,862,625 page-faults:u # 0.019 M/sec ( +- 0.01% ) 43,697,726,320,051 cycles:u # 2.514 GHz ( +- 0.03% ) 39,480,408,690,401 stalled-cycles-frontend:u # 90.35% frontend cycles idle ( +- 0.05% ) 28,363,394,221,388 instructions:u # 0.65 insn per cycle # 1.39 stalled cycles per insn ( +- 0.00% ) 6,316,794,985,573 branches:u # 363.410 M/sec ( +- 0.00% ) 251,013,232,547 branch-misses:u # 3.97% of all branches ( +- 0.01% ) 645.991174661 seconds time elapsed ( +- 1.19% ) Unfortunately, this approach doesn't help with text size: vmlinux.before .text size: 8190319 vmlinux.after .text size: 8200623 The .text section is increased by about 4k. Not sure if we can do anything about this.
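A rough C rendering of the entry-path canonicalization this patch touches (a sketch; the real code is the ALTERNATIVE-patched assembly in entry_64.S below, and example_make_canonical is a made-up name). The return address is sign-extended from bit 47 (4-level) or bit 56 (5-level); with cpu_feature_enabled()/ALTERNATIVE the mode choice is patched in once at boot instead of being tested on every syscall return:

static inline unsigned long example_make_canonical(unsigned long addr)
{
	/* 64 - 48 = 16 for 4-level paging, 64 - 57 = 7 for 5-level */
	int shift = pgtable_l5_enabled ? (64 - 57) : (64 - 48);

	return (unsigned long)((long)(addr << shift) >> shift);
}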
Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180216114948.68868-4-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/misc.h | 5 +++++ arch/x86/entry/entry_64.S | 11 ++--------- arch/x86/include/asm/pgtable_64_types.h | 5 ++++- arch/x86/kernel/head64.c | 9 +++++++-- arch/x86/kernel/head_64.S | 2 +- arch/x86/mm/kasan_init_64.c | 6 ++++++ 6 files changed, 25 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 9d323dc6b159..4d369c308ed7 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -12,6 +12,11 @@ #undef CONFIG_PARAVIRT_SPINLOCKS #undef CONFIG_KASAN +#ifdef CONFIG_X86_5LEVEL +/* cpu_feature_enabled() cannot be used that early */ +#define pgtable_l5_enabled __pgtable_l5_enabled +#endif + #include #include #include diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2c06348b7807..b18acdff9c3f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -275,15 +275,8 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) * depending on paging mode) in the address. */ #ifdef CONFIG_X86_5LEVEL - testl $1, pgtable_l5_enabled(%rip) - jz 1f - shl $(64 - 57), %rcx - sar $(64 - 57), %rcx - jmp 2f -1: - shl $(64 - 48), %rcx - sar $(64 - 48), %rcx -2: + ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \ "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57 #else shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 68909a68e5b9..d5c21a382475 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -21,7 +21,10 @@ typedef unsigned long pgprotval_t; typedef struct { pteval_t pte; } pte_t; #ifdef CONFIG_X86_5LEVEL -extern unsigned int pgtable_l5_enabled; +extern unsigned int __pgtable_l5_enabled; +#ifndef pgtable_l5_enabled +#define pgtable_l5_enabled cpu_feature_enabled(X86_FEATURE_LA57) +#endif #else #define pgtable_l5_enabled 0 #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 795e762f3c66..0c855deee165 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -32,6 +32,11 @@ #include #include +#ifdef CONFIG_X86_5LEVEL +#undef pgtable_l5_enabled +#define pgtable_l5_enabled __pgtable_l5_enabled +#endif + /* * Manage page tables very early on.
*/ @@ -40,8 +45,8 @@ static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); #ifdef CONFIG_X86_5LEVEL -unsigned int pgtable_l5_enabled __ro_after_init; -EXPORT_SYMBOL(pgtable_l5_enabled); +unsigned int __pgtable_l5_enabled __ro_after_init; +EXPORT_SYMBOL(__pgtable_l5_enabled); unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3e9de0fc97de..326c63129417 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -124,7 +124,7 @@ ENTRY(secondary_startup_64) /* Enable PAE mode, PGE and LA57 */ movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx #ifdef CONFIG_X86_5LEVEL - testl $1, pgtable_l5_enabled(%rip) + testl $1, __pgtable_l5_enabled(%rip) jz 1f orl $X86_CR4_LA57, %ecx 1: diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0df0dd13a71d..d8ff013ea9d0 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -1,6 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #define DISABLE_BRANCH_PROFILING #define pr_fmt(fmt) "kasan: " fmt + +#ifdef CONFIG_X86_5LEVEL +/* Too early to use cpu_feature_enabled() */ +#define pgtable_l5_enabled __pgtable_l5_enabled +#endif + #include #include #include -- cgit v1.2.3-55-g7522 From 038bac2b02989acf1fc938cedcb7944c02672b9f Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 19 Feb 2018 11:09:05 +0100 Subject: x86/acpi: Add a new x86_init_acpi structure to x86_init_ops Add a new struct x86_init_acpi to x86_init_ops. For now it contains only one init function to get the RSDP table address. Signed-off-by: Juergen Gross Reviewed-by: Andy Shevchenko Acked-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Cc: Borislav Petkov Cc: Eric Biederman Cc: H. Peter Anvin Cc: Kees Cook Cc: Kirill A. 
Shutemov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: boris.ostrovsky@oracle.com Cc: lenb@kernel.org Cc: linux-acpi@vger.kernel.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20180219100906.14265-3-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/acpi.h | 7 +++++++ arch/x86/include/asm/x86_init.h | 9 +++++++++ arch/x86/kernel/x86_init.c | 5 +++++ 3 files changed, 21 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 11881726ed37..6609dd7289b5 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -31,6 +31,7 @@ #include #include #include +#include <asm/x86_init.h> #ifdef CONFIG_ACPI_APEI # include @@ -133,6 +134,12 @@ static inline bool acpi_has_cpu_in_madt(void) return !!acpi_lapic; } +#define ACPI_HAVE_ARCH_GET_ROOT_POINTER +static inline u64 acpi_arch_get_root_pointer(void) +{ + return x86_init.acpi.get_root_pointer(); +} + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index fc2f082ac635..2e2c34d2bb00 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -130,6 +130,14 @@ struct x86_hyper_init { void (*init_mem_mapping)(void); }; +/** + * struct x86_init_acpi - x86 ACPI init functions + * @get_root_pointer: get RSDP address + */ +struct x86_init_acpi { + u64 (*get_root_pointer)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -144,6 +152,7 @@ struct x86_init_ops { struct x86_init_iommu iommu; struct x86_init_pci pci; struct x86_hyper_init hyper; + struct x86_init_acpi acpi; }; /** diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 1151ccd72ce9..9e4e994a4836 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -30,6 +30,7 @@ int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } bool __init bool_x86_init_noop(void) { return false; } void x86_op_int_noop(int cpu) { } +u64 u64_x86_init_noop(void) { return 0; } /* * The platform setup functions are preset with the default functions @@ -91,6 +92,10 @@ struct x86_init_ops x86_init __initdata = { .x2apic_available = bool_x86_init_noop, .init_mem_mapping = x86_init_noop, }, + + .acpi = { + .get_root_pointer = u64_x86_init_noop, + }, }; struct x86_cpuinit_ops x86_cpuinit = { -- cgit v1.2.3-55-g7522
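To illustrate how the new hook is meant to be consumed (a sketch; the example_* names are made up -- the next patch wires this up for real for Xen PVH): a platform that already knows the physical address of its RSDP installs a callback, and acpi_arch_get_root_pointer() above hands that address to the ACPI core instead of letting it scan low memory:

static u64 example_rsdp_paddr;	/* RSDP address handed over by firmware */

static u64 example_get_root_pointer(void)
{
	return example_rsdp_paddr;
}

static void __init example_platform_setup(void)
{
	x86_init.acpi.get_root_pointer = example_get_root_pointer;
}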
From b17d9d1df3c33a4f1d2bf397e2257aecf9dc56d4 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 19 Feb 2018 11:09:06 +0100 Subject: x86/xen: Add pvh specific rsdp address retrieval function Add pvh_get_root_pointer() for Xen PVH guests to communicate the address of the RSDP table given to the kernel via Xen start info. This makes the kernel boot again in PVH mode after recent Xen moved the RSDP to higher addresses; up to that change it was pure luck that the legacy method of locating the RSDP worked when running in PVH mode. Signed-off-by: Juergen Gross Reviewed-by: Andy Shevchenko Acked-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Cc: Borislav Petkov Cc: Eric Biederman Cc: H. Peter Anvin Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: boris.ostrovsky@oracle.com Cc: lenb@kernel.org Cc: linux-acpi@vger.kernel.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20180219100906.14265-4-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten_pvh.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 436c4f003e17..aa1c6a6831a9 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -16,15 +17,20 @@ /* * PVH variables. * - * xen_pvh and pvh_bootparams need to live in data segment since they - * are used after startup_{32|64}, which clear .bss, are invoked. + * xen_pvh pvh_bootparams and pvh_start_info need to live in data segment + * since they are used after startup_{32|64}, which clear .bss, are invoked. */ bool xen_pvh __attribute__((section(".data"))) = 0; struct boot_params pvh_bootparams __attribute__((section(".data"))); +struct hvm_start_info pvh_start_info __attribute__((section(".data"))); -struct hvm_start_info pvh_start_info; unsigned int pvh_start_info_sz = sizeof(pvh_start_info); +static u64 pvh_get_root_pointer(void) +{ + return pvh_start_info.rsdp_paddr; +} + static void __init init_pvh_bootparams(void) { struct xen_memory_map memmap; @@ -71,6 +77,8 @@ static void __init init_pvh_bootparams(void) */ pvh_bootparams.hdr.version = 0x212; pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ + + x86_init.acpi.get_root_pointer = pvh_get_root_pointer; } /* -- cgit v1.2.3-55-g7522 From c46dacb75cd59a50a2380dcba5e7edf4fde86845 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 21 Feb 2018 10:42:32 +0100 Subject: x86/boot: Make the x86_init noop functions static Make the noop functions in x86_init.c static, since they are only used locally. Reported-by: kbuild test robot Signed-off-by: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180221094232.23462-1-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/x86_init.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 9e4e994a4836..b8cff22a8785 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -26,11 +26,11 @@ void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } -int __init iommu_init_noop(void) { return 0; } -void iommu_shutdown_noop(void) { } -bool __init bool_x86_init_noop(void) { return false; } -void x86_op_int_noop(int cpu) { } -u64 u64_x86_init_noop(void) { return 0; } +static int __init iommu_init_noop(void) { return 0; } +static void iommu_shutdown_noop(void) { } +static bool __init bool_x86_init_noop(void) { return false; } +static void x86_op_int_noop(int cpu) { } +static u64 u64_x86_init_noop(void) { return 0; } /* * The platform setup functions are preset with the default functions -- cgit v1.2.3-55-g7522 From 672c0ae09b33a11d8f31fc61526632e96301164c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 23 Feb 2018 01:27:37 -0700 Subject: x86/mm: Consider effective protection attributes in W+X check Using just the leaf page table entry flags would cause a false warning in case _PAGE_RW is clear or _PAGE_NX is set in a higher level entry.
Hand through both the current entry's flags as well as the accumulated effective value (the latter as pgprotval_t instead of pgprot_t, as it's not an actual entry's value). This in particular eliminates the false W+X warning when running under Xen, as commit: 2cc42bac1c ("x86-64/Xen: eliminate W+X mappings") had to make the necessary adjustment in L2 rather than L1 (the reason is explained there). I.e. _PAGE_RW is clear there in L1, but _PAGE_NX is set in L2. Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5A8FDE8902000078001AABBB@prv-mh.provo.novell.com Signed-off-by: Ingo Molnar --- arch/x86/mm/dump_pagetables.c | 94 ++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 36 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0d6d67d18ad6..62a7e9f65dec 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -29,6 +29,7 @@ struct pg_state { int level; pgprot_t current_prot; + pgprotval_t effective_prot; unsigned long start_address; unsigned long current_address; const struct addr_marker *marker; @@ -235,9 +236,9 @@ static unsigned long normalize_addr(unsigned long u) * print what we collected so far. */ static void note_page(struct seq_file *m, struct pg_state *st, - pgprot_t new_prot, int level) + pgprot_t new_prot, pgprotval_t new_eff, int level) { - pgprotval_t prot, cur; + pgprotval_t prot, cur, eff; static const char units[] = "BKMGTPE"; /* @@ -247,23 +248,24 @@ static void note_page(struct seq_file *m, struct pg_state *st, */ prot = pgprot_val(new_prot); cur = pgprot_val(st->current_prot); + eff = st->effective_prot; if (!st->level) { /* First entry */ st->current_prot = new_prot; + st->effective_prot = new_eff; st->level = level; st->marker = address_markers; st->lines = 0; pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", st->marker->name); - } else if (prot != cur || level != st->level || + } else if (prot != cur || new_eff != eff || level != st->level || st->current_address >= st->marker[1].start_address) { const char *unit = units; unsigned long delta; int width = sizeof(unsigned long) * 2; - pgprotval_t pr = pgprot_val(st->current_prot); - if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) { + if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) { WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %p/%pS\n", (void *)st->start_address, @@ -317,21 +319,30 @@ static void note_page(struct seq_file *m, struct pg_state *st, st->start_address = st->current_address; st->current_prot = new_prot; + st->effective_prot = new_eff; st->level = level; } } -static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P) +static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) +{ + return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | + ((prot1 | prot2) & _PAGE_NX); +} + +static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, + pgprotval_t eff_in, unsigned long P) { int i; pte_t *start; - pgprotval_t prot; + pgprotval_t prot, eff; start = (pte_t *)pmd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PTE; i++) { prot = pte_flags(*start); + eff = effective_prot(eff_in, prot); st->current_address = 
normalize_addr(P + i * PTE_LEVEL_MULT); - note_page(m, st, __pgprot(prot), 5); + note_page(m, st, __pgprot(prot), eff, 5); start++; } } @@ -351,7 +362,7 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, (pgtable_l5_enabled && __pa(pt) == __pa(kasan_zero_p4d)) || __pa(pt) == __pa(kasan_zero_pud)) { pgprotval_t prot = pte_flags(kasan_zero_pte[0]); - note_page(m, st, __pgprot(prot), 5); + note_page(m, st, __pgprot(prot), 0, 5); return true; } return false; @@ -366,42 +377,45 @@ static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, #if PTRS_PER_PMD > 1 -static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, + pgprotval_t eff_in, unsigned long P) { int i; pmd_t *start, *pmd_start; - pgprotval_t prot; + pgprotval_t prot, eff; pmd_start = start = (pmd_t *)pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { + prot = pmd_flags(*start); + eff = effective_prot(eff_in, prot); if (pmd_large(*start) || !pmd_present(*start)) { - prot = pmd_flags(*start); - note_page(m, st, __pgprot(prot), 4); + note_page(m, st, __pgprot(prot), eff, 4); } else if (!kasan_page_table(m, st, pmd_start)) { - walk_pte_level(m, st, *start, + walk_pte_level(m, st, *start, eff, P + i * PMD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 4); + note_page(m, st, __pgprot(0), 0, 4); start++; } } #else -#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p) +#define walk_pmd_level(m,s,a,e,p) walk_pte_level(m,s,__pmd(pud_val(a)),e,p) #define pud_large(a) pmd_large(__pmd(pud_val(a))) #define pud_none(a) pmd_none(__pmd(pud_val(a))) #endif #if PTRS_PER_PUD > 1 -static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) +static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, + pgprotval_t eff_in, unsigned long P) { int i; pud_t *start, *pud_start; - pgprotval_t prot; + pgprotval_t prot, eff; pud_t *prev_pud = NULL; pud_start = start = (pud_t *)p4d_page_vaddr(addr); @@ -409,15 +423,16 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); if (!pud_none(*start)) { + prot = pud_flags(*start); + eff = effective_prot(eff_in, prot); if (pud_large(*start) || !pud_present(*start)) { - prot = pud_flags(*start); - note_page(m, st, __pgprot(prot), 3); + note_page(m, st, __pgprot(prot), eff, 3); } else if (!kasan_page_table(m, st, pud_start)) { - walk_pmd_level(m, st, *start, + walk_pmd_level(m, st, *start, eff, P + i * PUD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 3); + note_page(m, st, __pgprot(0), 0, 3); prev_pud = start; start++; @@ -425,34 +440,36 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, } #else -#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p) +#define walk_pud_level(m,s,a,e,p) walk_pmd_level(m,s,__pud(p4d_val(a)),e,p) #define p4d_large(a) pud_large(__pud(p4d_val(a))) #define p4d_none(a) pud_none(__pud(p4d_val(a))) #endif -static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) +static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, + pgprotval_t eff_in, unsigned long P) { int i; p4d_t *start, *p4d_start; - pgprotval_t prot; + 
pgprotval_t prot; + pgprotval_t prot, eff; if (PTRS_PER_P4D == 1) - return walk_pud_level(m, st, __p4d(pgd_val(addr)), P); + return walk_pud_level(m, st, __p4d(pgd_val(addr)), eff_in, P); p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_P4D; i++) { st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); if (!p4d_none(*start)) { + prot = p4d_flags(*start); + eff = effective_prot(eff_in, prot); if (p4d_large(*start) || !p4d_present(*start)) { - prot = p4d_flags(*start); - note_page(m, st, __pgprot(prot), 2); + note_page(m, st, __pgprot(prot), eff, 2); } else if (!kasan_page_table(m, st, p4d_start)) { - walk_pud_level(m, st, *start, + walk_pud_level(m, st, *start, eff, P + i * P4D_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 2); + note_page(m, st, __pgprot(0), 0, 2); start++; } @@ -483,7 +500,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, #else pgd_t *start = swapper_pg_dir; #endif - pgprotval_t prot; + pgprotval_t prot, eff; int i; struct pg_state st = {}; @@ -499,15 +516,20 @@ for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); if (!pgd_none(*start) && !is_hypervisor_range(i)) { + prot = pgd_flags(*start); +#ifdef CONFIG_X86_PAE + eff = _PAGE_USER | _PAGE_RW; +#else + eff = prot; +#endif if (pgd_large(*start) || !pgd_present(*start)) { - prot = pgd_flags(*start); - note_page(m, &st, __pgprot(prot), 1); + note_page(m, &st, __pgprot(prot), eff, 1); } else { - walk_p4d_level(m, &st, *start, + walk_p4d_level(m, &st, *start, eff, i * PGD_LEVEL_MULT); } } else - note_page(m, &st, __pgprot(0), 1); + note_page(m, &st, __pgprot(0), 0, 1); cond_resched(); start++; @@ -515,7 +537,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, /* Flush out the last page */ st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); - note_page(m, &st, __pgprot(0), 0); + note_page(m, &st, __pgprot(0), 0, 0); if (!checkwx) return; if (st.wx_pages) -- cgit v1.2.3-55-g7522 From ef61f8a340fd6d49df6b367785743febc47320c1 Mon Sep 17 00:00:00 2001 From: Jan H. Schönherr Date: Sat, 3 Feb 2018 00:10:20 +0100 Subject: x86/boot/e820: Implement a range manipulation operator Add a more versatile memmap= operator, which -- in addition to all the things that were possible before -- allows you to: - redeclare existing ranges -- before, you were limited to adding ranges; - drop any range -- like a mem= for any location; - use any e820 memory type -- not just some predefined ones. The syntax is: memmap=<size>%<offset>-<oldtype>+<newtype> Size and offset work as usual. The "-<oldtype>" and "+<newtype>" are optional and their existence determines the behavior: The command works on the specified range of memory limited to type <oldtype> (if specified). This memory is then configured to show up as <newtype>. If <newtype> is not specified, the memory is removed from the e820 map.
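For example (hypothetical values, using the e820 type numbers documented in the hunk below -- 1 = RAM, 12 = PRAM):

    memmap=512M%0x40000000-1+12   convert 512M of RAM at 1G into PRAM
    memmap=512M%0x40000000+12     mark the range as PRAM whatever it was before
    memmap=512M%0x40000000        remove the range from the e820 map entirely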
Schönherr Cc: Andy Lutomirski Cc: Andy Shevchenko Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180202231020.15608-1-jschoenh@amazon.de Signed-off-by: Ingo Molnar --- Documentation/admin-guide/kernel-parameters.txt | 9 +++++++++ arch/x86/kernel/e820.c | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'arch') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1d1d53f85ddd..5529fa82700b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2237,6 +2237,15 @@ The memory region may be marked as e820 type 12 (0xc) and is NVDIMM or ADR memory. + memmap=%-+ + [KNL,ACPI] Convert memory within the specified region + from to . If "-" is left + out, the whole region will be marked as , + even if previously unavailable. If "+" is left + out, matching memory will be removed. Types are + specified as e820 types, e.g., 1 = RAM, 2 = reserved, + 3 = ACPI, 12 = PRAM. + memory_corruption_check=0/1 [X86] Some BIOSes seem to corrupt the first 64k of memory when doing things like suspend/resume. diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 71c11ad5643e..6a2cb1442e05 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -924,6 +924,24 @@ static int __init parse_memmap_one(char *p) } else if (*p == '!') { start_at = memparse(p+1, &p); e820__range_add(start_at, mem_size, E820_TYPE_PRAM); + } else if (*p == '%') { + enum e820_type from = 0, to = 0; + + start_at = memparse(p + 1, &p); + if (*p == '-') + from = simple_strtoull(p + 1, &p, 0); + if (*p == '+') + to = simple_strtoull(p + 1, &p, 0); + if (*p != '\0') + return -EINVAL; + if (from && to) + e820__range_update(start_at, mem_size, from, to); + else if (to) + e820__range_add(start_at, mem_size, to); + else if (from) + e820__range_remove(start_at, mem_size, from, 1); + else + e820__range_remove(start_at, mem_size, 0, 0); } else { e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); } -- cgit v1.2.3-55-g7522 From ae8d1d0061ad7996c2c5e769e809a593544fa145 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 26 Feb 2018 17:25:54 -0600 Subject: x86/mm/sme: Disable stack protection for mem_encrypt_identity.c Stack protection is not compatible with early boot code. All of the early SME boot code is now isolated in a separate file, mem_encrypt_identity.c, so arch/x86/mm/Makefile can be updated to turn off stack protection for the entire file. This eliminates the need to worry about other functions within the file being instrumented with stack protection (as was seen when a newer version of GCC instrumented sme_encrypt_kernel() where an older version hadn't). It also allows removal of the __nostackprotector attribute from individual functions. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Acked-by: Kirill A. 
Shutemov Cc: Linus Torvalds Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20180226232554.14108.16881.stgit@tlendack-t1.amdoffice.net --- arch/x86/mm/Makefile | 1 + arch/x86/mm/mem_encrypt_identity.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 03c6c8561623..4b101dd6e52f 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -19,6 +19,7 @@ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_physaddr.o := $(nostackp) CFLAGS_setup_nx.o := $(nostackp) +CFLAGS_mem_encrypt_identity.o := $(nostackp) CFLAGS_fault.o := -I$(src)/../include/asm/trace diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index b4139c5ab972..1b2197d13832 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -266,7 +266,7 @@ static unsigned long __init sme_pgtable_calc(unsigned long len) return entries + tables; } -void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp) +void __init sme_encrypt_kernel(struct boot_params *bp) { unsigned long workarea_start, workarea_end, workarea_len; unsigned long execute_start, execute_end, execute_len; @@ -468,7 +468,7 @@ void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp) native_write_cr3(__native_read_cr3()); } -void __init __nostackprotector sme_enable(struct boot_params *bp) +void __init sme_enable(struct boot_params *bp) { const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; unsigned int eax, ebx, ecx, edx; -- cgit v1.2.3-55-g7522 From a403d798182f4f7be5e9bab56cfa37e9828fd92a Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Mon, 26 Feb 2018 21:04:47 +0300 Subject: x86/boot/compressed/64: Describe the logic behind the LA57 check The patch explains the LA57 check in more details. Tested-by: Borislav Petkov Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Andy Shevchenko Cc: Cyrill Gorcunov Cc: Eric Biederman Cc: H. Peter Anvin Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180226180451.86788-2-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/pgtable_64.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 3f1697fcc7a8..45c76eff2718 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -18,10 +18,22 @@ struct paging_config paging_prepare(void) { struct paging_config paging_config = {}; - /* Check if LA57 is desired and supported */ - if (IS_ENABLED(CONFIG_X86_5LEVEL) && native_cpuid_eax(0) >= 7 && - (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) + /* + * Check if LA57 is desired and supported. + * + * There are two parts to the check: + * - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y + * - if the machine supports 5-level paging: + * + CPUID leaf 7 is supported + * + the leaf has the feature bit set + * + * That's substitute for boot_cpu_has() in early boot code. 
+ */ + if (IS_ENABLED(CONFIG_X86_5LEVEL) && + native_cpuid_eax(0) >= 7 && + (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) { paging_config.l5_required = 1; + } return paging_config; } -- cgit v1.2.3-55-g7522 From 3548e131ec6a82208f36e68d31947b0fe244c7a7 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Mon, 26 Feb 2018 21:04:48 +0300 Subject: x86/boot/compressed/64: Find a place for 32-bit trampoline If a bootloader enables 64-bit mode with 4-level paging, we might need to switch over to 5-level paging. The switching requires the disabling of paging, which works fine if the kernel itself is loaded below 4G. But if the bootloader puts the kernel above 4G (not sure if anybody does this), we would lose control as soon as paging is disabled, because the code becomes unreachable to the CPU. To handle the situation, we need a trampoline in lower memory that would take care of switching on 5-level paging. This patch finds a spot in low memory for a trampoline. The heuristic is based on code in reserve_bios_regions(). We find the end of low memory based on BIOS and EBDA start addresses. The trampoline is put just before the end of low memory. It mimics the approach taken to allocate memory for the real-mode trampoline. Tested-by: Borislav Petkov Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Andy Shevchenko Cc: Cyrill Gorcunov Cc: Eric Biederman Cc: H. Peter Anvin Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180226180451.86788-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/misc.c | 6 ++++++ arch/x86/boot/compressed/pgtable.h | 11 +++++++++++ arch/x86/boot/compressed/pgtable_64.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100644 arch/x86/boot/compressed/pgtable.h (limited to 'arch') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index b50c42455e25..8e4b55dd5df9 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -14,6 +14,7 @@ #include "misc.h" #include "error.h" +#include "pgtable.h" #include "../string.h" #include "../voffset.h" @@ -372,6 +373,11 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, debug_putaddr(output_len); debug_putaddr(kernel_total_size); +#ifdef CONFIG_X86_64 + /* Report address of 32-bit trampoline */ + debug_putaddr(trampoline_32bit); +#endif + /* * The memory hole needed for the kernel is the larger of either * the entire decompressed kernel plus relocation table, or the diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h new file mode 100644 index 000000000000..57722a2fe2a0 --- /dev/null +++ b/arch/x86/boot/compressed/pgtable.h @@ -0,0 +1,11 @@ +#ifndef BOOT_COMPRESSED_PAGETABLE_H +#define BOOT_COMPRESSED_PAGETABLE_H + +#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE) + +#ifndef __ASSEMBLER__ + +extern unsigned long *trampoline_32bit; + +#endif /* __ASSEMBLER__ */ +#endif /* BOOT_COMPRESSED_PAGETABLE_H */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 45c76eff2718..21d5cc1cd5fa 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,4 +1,5 @@ #include +#include "pgtable.h" /* * __force_order is used by special_insns.h asm code to force instruction @@ -9,14 +10,27 @@ */ unsigned long __force_order; +#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ +#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ + struct paging_config { unsigned long trampoline_start; unsigned long l5_required; }; +/* + * Trampoline address will be printed by extract_kernel() for debugging + * purposes. + * + * Avoid putting the pointer into .bss as it will be cleared between + * paging_prepare() and extract_kernel(). + */ +unsigned long *trampoline_32bit __section(.data); + struct paging_config paging_prepare(void) { struct paging_config paging_config = {}; + unsigned long bios_start, ebda_start; /* * Check if LA57 is desired and supported. @@ -35,5 +49,25 @@ struct paging_config paging_prepare(void) paging_config.l5_required = 1; } + /* + * Find a suitable spot for the trampoline. + * This code is based on reserve_bios_regions(). + */ + + ebda_start = *(unsigned short *)0x40e << 4; + bios_start = *(unsigned short *)0x413 << 10; + + if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) + bios_start = BIOS_START_MAX; + + if (ebda_start > BIOS_START_MIN && ebda_start < bios_start) + bios_start = ebda_start; + + /* Place the trampoline just below the end of low memory, aligned to 4k */ + paging_config.trampoline_start = bios_start - TRAMPOLINE_32BIT_SIZE; + paging_config.trampoline_start = round_down(paging_config.trampoline_start, PAGE_SIZE); + + trampoline_32bit = (unsigned long *)paging_config.trampoline_start; + return paging_config; } -- cgit v1.2.3-55-g7522 From fb5268354d20b82c12569e325b0d051c09f983f7 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Mon, 26 Feb 2018 21:04:49 +0300 Subject: x86/boot/compressed/64: Save and restore trampoline memory The memory area we found for the trampoline shouldn't contain anything useful, but let's preserve the data anyway, just to be on the safe side. paging_prepare() saves the data into a buffer; cleanup_trampoline() restores it once we are done with the trampoline. Tested-by: Borislav Petkov Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Andy Shevchenko Cc: Cyrill Gorcunov Cc: Eric Biederman Cc: H. Peter Anvin Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180226180451.86788-4-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/head_64.S | 10 ++++++++++ arch/x86/boot/compressed/pgtable_64.c | 13 +++++++++++++ 2 files changed, 23 insertions(+) (limited to 'arch') diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index d598d65db32c..8ba0582c65d5 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -355,6 +355,16 @@ ENTRY(startup_64) lretq lvl5: + /* + * cleanup_trampoline() would restore trampoline memory. + * + * RSI holds real mode data and needs to be preserved across + * this function call.
+ */ + pushq %rsi + call cleanup_trampoline + popq %rsi + /* Zero EFLAGS */ pushq $0 popfq diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 21d5cc1cd5fa..01d08d3e3e43 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,5 +1,6 @@ #include #include "pgtable.h" +#include "../string.h" /* * __force_order is used by special_insns.h asm code to force instruction @@ -18,6 +19,9 @@ struct paging_config { unsigned long l5_required; }; +/* Buffer to preserve trampoline memory */ +static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; + /* * Trampoline address will be printed by extract_kernel() for debugging * purposes. @@ -69,5 +73,14 @@ struct paging_config paging_prepare(void) trampoline_32bit = (unsigned long *)paging_config.trampoline_start; + /* Preserve trampoline memory */ + memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE); + return paging_config; } + +void cleanup_trampoline(void) +{ + /* Restore trampoline memory */ + memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE); +} -- cgit v1.2.3-55-g7522 From 32fcefa2bfc8961987e91d1daeb00624b4176d2e Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Mon, 26 Feb 2018 21:04:50 +0300 Subject: x86/boot/compressed/64: Set up trampoline memory This patch clears up trampoline memory and copies trampoline code in place. It's not yet used though. Tested-by: Borislav Petkov Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Andy Shevchenko Cc: Cyrill Gorcunov Cc: Eric Biederman Cc: H. Peter Anvin Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180226180451.86788-5-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/head_64.S | 3 ++- arch/x86/boot/compressed/pgtable.h | 9 +++++++++ arch/x86/boot/compressed/pgtable_64.c | 7 +++++++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 8ba0582c65d5..c813cb004056 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -501,8 +501,9 @@ relocated: jmp *%rax .code32 +ENTRY(trampoline_32bit_src) compatible_mode: - /* Setup data and stack segments */ + /* Set up data and stack segments */ movl $__KERNEL_DS, %eax movl %eax, %ds movl %eax, %ss diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h index 57722a2fe2a0..91f75638f6e6 100644 --- a/arch/x86/boot/compressed/pgtable.h +++ b/arch/x86/boot/compressed/pgtable.h @@ -3,9 +3,18 @@ #define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE) +#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0 + +#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE +#define TRAMPOLINE_32BIT_CODE_SIZE 0x60 + +#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE + #ifndef __ASSEMBLER__ extern unsigned long *trampoline_32bit; +extern void trampoline_32bit_src(void *return_ptr); + #endif /* __ASSEMBLER__ */ #endif /* BOOT_COMPRESSED_PAGETABLE_H */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 01d08d3e3e43..810c2c32d98e 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -76,6 +76,13 @@ struct paging_config paging_prepare(void) /* Preserve trampoline memory */ memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE); + /* Clear trampoline memory first */ + 
memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE); + + /* Copy trampoline code in place */ + memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long), + &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE); + return paging_config; } -- cgit v1.2.3-55-g7522 From e9d0e6330eb81ca49bdd8849cc52b3b0f70ed5cb Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Mon, 26 Feb 2018 21:04:51 +0300 Subject: x86/boot/compressed/64: Prepare new top-level page table for trampoline If trampoline code would need to switch between 4- and 5-level paging modes, we have to use a page table in trampoline memory. Having it in trampoline memory guarantees that it's below 4G and we can point CR3 to it from 32-bit trampoline code. We only use the page table if the desired paging mode doesn't match the mode we are in. Otherwise the page table is unused and trampoline code wouldn't touch CR3. For 4- to 5-level paging transition, we set up current (4-level paging) CR3 as the first and the only entry in a new top-level page table. For 5- to 4-level paging transition, copy page table pointed by first entry in the current top-level page table as our new top-level page table. If the page table is used by trampoline we would need to copy it to new page table outside trampoline and update CR3 before restoring trampoline memory. Tested-by: Borislav Petkov Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Andy Shevchenko Cc: Cyrill Gorcunov Cc: Eric Biederman Cc: H. Peter Anvin Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180226180451.86788-6-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/pgtable_64.c | 61 +++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'arch') diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 810c2c32d98e..32af1cbcd903 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -22,6 +22,14 @@ struct paging_config { /* Buffer to preserve trampoline memory */ static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; +/* + * The page table is going to be used instead of page table in the trampoline + * memory. + * + * It must not be in BSS as BSS is cleared after cleanup_trampoline(). + */ +static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data); + /* * Trampoline address will be printed by extract_kernel() for debugging * purposes. @@ -83,11 +91,64 @@ struct paging_config paging_prepare(void) memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long), &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE); + /* + * The code below prepares page table in trampoline memory. + * + * The new page table will be used by trampoline code for switching + * from 4- to 5-level paging or vice versa. + * + * If switching is not required, the page table is unused: trampoline + * code wouldn't touch CR3. + */ + + /* + * We are not going to use the page table in trampoline memory if we + * are already in the desired paging mode. + */ + if (paging_config.l5_required == !!(native_read_cr4() & X86_CR4_LA57)) + goto out; + + if (paging_config.l5_required) { + /* + * For 4- to 5-level paging transition, set up current CR3 as + * the first and the only entry in a new top-level page table. 
+ */ + trampoline_32bit[TRAMPOLINE_32BIT_PGTABLE_OFFSET] = __native_read_cr3() | _PAGE_TABLE_NOENC; + } else { + unsigned long src; + + /* + * For 5- to 4-level paging transition, copy page table pointed + * by first entry in the current top-level page table as our + * new top-level page table. + * + * We cannot just point to the page table from trampoline as it + * may be above 4G. + */ + src = *(unsigned long *)__native_read_cr3() & PAGE_MASK; + memcpy(trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long), + (void *)src, PAGE_SIZE); + } + +out: return paging_config; } void cleanup_trampoline(void) { + void *trampoline_pgtable; + + trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET; + + /* + * Move the top level page table out of trampoline memory, + * if it's there. + */ + if ((void *)__native_read_cr3() == trampoline_pgtable) { + memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE); + native_write_cr3((unsigned long)top_pgtable); + } + /* Restore trampoline memory */ memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE); } -- cgit v1.2.3-55-g7522 From c100a583601d357f923c41af5434dc1f8d07890f Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 2 Mar 2018 13:18:01 +0800 Subject: kdump, vmcoreinfo: Export pgtable_l5_enabled value User-space utilities examining crash-kernels need to know if the crashed kernel was in 5-level paging mode or not. So write 'pgtable_l5_enabled' to vmcoreinfo, which covers these three cases: pgtable_l5_enabled == 0 when: - Compiled with !CONFIG_X86_5LEVEL - Compiled with CONFIG_X86_5LEVEL=y while CPU has no 'la57' flag pgtable_l5_enabled != 0 when: - Compiled with CONFIG_X86_5LEVEL=y and CPU has 'la57' flag Signed-off-by: Baoquan He Acked-by: Kirill A. Shutemov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: douly.fnst@cn.fujitsu.com Cc: dyoung@redhat.com Cc: ebiederm@xmission.com Cc: kirill.shutemov@linux.intel.com Cc: vgoyal@redhat.com Link: http://lkml.kernel.org/r/20180302051801.19594-1-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/machine_kexec_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 3b7427aa7d85..02f913cb27b5 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -350,6 +350,7 @@ void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_NUMBER(phys_base); VMCOREINFO_SYMBOL(init_top_pgt); + VMCOREINFO_NUMBER(pgtable_l5_enabled); #ifdef CONFIG_NUMA VMCOREINFO_SYMBOL(node_data); -- cgit v1.2.3-55-g7522 From a5b162b2ecb013ed517ab5ce90079117ada743f4 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Mon, 5 Mar 2018 11:16:41 +0300 Subject: x86/mm: Do not use paravirtualized calls in native_set_p4d() In 4-level paging mode, native_set_p4d() updates the entry in the top-level page table. With PTI, update to the top-level kernel page table requires update to the userspace copy of the table as well, using pti_set_user_pgd(). native_set_p4d() uses p4d_val() and pgd_val() to convert types between p4d_t and pgd_t. p4d_val() and pgd_val() are paravirtualized and we must not use them in native helpers, as they crash the boot in paravirtualized environments. Replace p4d_val() and pgd_val() with native_p4d_val() and native_pgd_val() in native_set_p4d(). Reported-by: Fengguang Wu Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 91f606a8fa68 ("x86/mm: Replace compile-time checks for 5-level paging with runtime-time checks") Link: http://lkml.kernel.org/r/20180305081641.4290-1-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 81dda8d1d0bd..163e01a0631d 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -224,9 +224,9 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) return; } - pgd = native_make_pgd(p4d_val(p4d)); + pgd = native_make_pgd(native_p4d_val(p4d)); pgd = pti_set_user_pgd((pgd_t *)p4dp, pgd); - *p4dp = native_make_p4d(pgd_val(pgd)); + *p4dp = native_make_p4d(native_pgd_val(pgd)); } static inline void native_p4d_clear(p4d_t *p4d) -- cgit v1.2.3-55-g7522 From 7beebaccd5083d61e975acb581bd528b9326e7c4 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Mon, 12 Mar 2018 13:02:43 +0300 Subject: x86/boot/compressed/64: Make sure we have a 32-bit code segment When the kernel starts in 64-bit mode we inherit the GDT from the bootloader. It may cause a problem if the GDT doesn't have a 32-bit code segment where we expect it to be. Load our own GDT with known segments. Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180312100246.89175-2-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/head_64.S | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index c813cb004056..f0c3a2f7e528 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -313,6 +313,11 @@ ENTRY(startup_64) * first. */ + /* Make sure we have GDT with 32-bit code segment */ + leaq gdt(%rip), %rax + movq %rax, gdt64+2(%rip) + lgdt gdt64(%rip) + /* * paging_prepare() sets up the trampoline and checks if we need to * enable 5-level paging. @@ -547,6 +552,11 @@ no_longmode: #include "../../kernel/verify_cpu.S" .data +gdt64: + .word gdt_end - gdt + .long 0 + .word 0 + .quad 0 gdt: .word gdt_end - gdt .long gdt -- cgit v1.2.3-55-g7522 From f7ff53e4702be9ac49deacd518dd243de45c9980 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Mon, 12 Mar 2018 13:02:44 +0300 Subject: x86/boot/compressed/64: Use stack from trampoline memory As the first step in using trampoline memory, let's make the 32-bit code use the stack there. A separate stack is required to return from the trampoline, and we cannot use the stack from 64-bit mode as it may be above 4G. Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180312100246.89175-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/head_64.S | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index f0c3a2f7e528..12915511be61 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -33,6 +33,7 @@ #include #include #include +#include "pgtable.h" /* * Locally defined symbols should be marked hidden: @@ -359,6 +360,8 @@ ENTRY(startup_64) pushq %rax lretq lvl5: + /* Restore the stack, the 32-bit trampoline uses its own stack */ + leaq boot_stack_end(%rbx), %rsp /* * cleanup_trampoline() would restore trampoline memory. @@ -513,6 +516,9 @@ compatible_mode: movl %eax, %ds movl %eax, %ss + /* Set up new stack */ + leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp + /* Disable paging */ movl %cr0, %eax btrl $X86_CR0_PG_BIT, %eax -- cgit v1.2.3-55-g7522 From 0a1756bd2897951c03c1cb671bdfd40729ac2177 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Mon, 12 Mar 2018 13:02:45 +0300 Subject: x86/boot/compressed/64: Use page table in trampoline memory If a bootloader enables 64-bit mode with 4-level paging, we might need to switch over to 5-level paging. The switching requires disabling paging. It works fine if the kernel itself is loaded below 4G. But if the bootloader puts the kernel above 4G (e.g., in the kexec() case), we would lose control as soon as paging is disabled, because the code becomes unreachable to the CPU. To handle the situation, we need a trampoline in lower memory that would take care of switching on 5-level paging. Apart from the trampoline code itself we also need a place to store the top-level page table in lower memory as we don't have a way to load 64-bit values into CR3 in 32-bit mode. We only really need 8 bytes there as we only use the very first entry of the page table. But we allocate a whole page anyway. This patch switches the 32-bit code to use the page table in trampoline memory. Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180312100246.89175-4-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/head_64.S | 47 +++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 12915511be61..959ca07a58d9 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -336,23 +336,6 @@ ENTRY(startup_64) /* Save the trampoline address in RCX */ movq %rax, %rcx - /* Check if we need to enable 5-level paging */ - cmpq $0, %rdx - jz lvl5 - - /* Clear additional page table */ - leaq lvl5_pgtable(%rbx), %rdi - xorq %rax, %rax - movq $(PAGE_SIZE/8), %rcx - rep stosq - - /* - * Setup current CR3 as the first and only entry in a new top level - * page table. - */ - movq %cr3, %rdi - leaq 0x7 (%rdi), %rax - movq %rax, lvl5_pgtable(%rbx) /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ pushq $__KERNEL32_CS @@ -524,13 +507,31 @@ compatible_mode: btrl $X86_CR0_PG_BIT, %eax movl %eax, %cr0 - /* Point CR3 to 5-level paging */ - leal lvl5_pgtable(%ebx), %eax - movl %eax, %cr3 + /* Check what paging mode we want to be in after the trampoline */ + cmpl $0, %edx + jz 1f - /* Enable PAE and LA57 mode */ + /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */ movl %cr4, %eax - orl $(X86_CR4_PAE | X86_CR4_LA57), %eax + testl $X86_CR4_LA57, %eax + jnz 3f + jmp 2f +1: + /* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */ + movl %cr4, %eax + testl $X86_CR4_LA57, %eax + jz 3f +2: + /* Point CR3 to the trampoline's new top level page table */ + leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax + movl %eax, %cr3 +3: + /* Enable PAE and LA57 (if required) paging modes */ + movl $X86_CR4_PAE, %eax + cmpl $0, %edx + jz 1f + orl $X86_CR4_LA57, %eax +1: movl %eax, %cr4 /* Calculate address we are running at */ @@ -611,5 +612,3 @@ boot_stack_end: .balign 4096 pgtable: .fill BOOT_PGT_SIZE, 1, 0 -lvl5_pgtable: - .fill PAGE_SIZE, 1, 0 -- cgit v1.2.3-55-g7522 From 194a9749c73d650c0b1dfdee04fb0bdf0a888ba8 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Mon, 12 Mar 2018 13:02:46 +0300 Subject: x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G This patch addresses a shortcoming in the current boot process on machines that support 5-level paging. If a bootloader enables 64-bit mode with 4-level paging, we might need to switch over to 5-level paging. The switching requires disabling paging. It works fine if the kernel itself is loaded below 4G. But if the bootloader puts the kernel above 4G (not sure if anybody does this), we would lose control as soon as paging is disabled, because the code becomes unreachable to the CPU. This patch implements a trampoline in lower memory to handle this situation. We only need the memory for a very short time, until the main kernel image sets up its own page tables. We go through the trampoline even if we don't have to: if we're already in 5-level paging mode or if we don't need to switch to it. This way the trampoline gets tested on every boot. Signed-off-by: Kirill A. Shutemov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180312100246.89175-5-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/head_64.S | 69 +++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 959ca07a58d9..fca012baba19 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -307,11 +307,27 @@ ENTRY(startup_64) /* * At this point we are in long mode with 4-level paging enabled, - * but we want to enable 5-level paging. + * but we might want to enable 5-level paging or vice versa. * - * The problem is that we cannot do it directly.
Setting or clearing + * CR4.LA57 in long mode would trigger #GP. So we need to switch off + * long mode and paging first. + * + * We also need a trampoline in lower memory to switch over from + * 4- to 5-level paging for cases when the bootloader puts the kernel + * above 4G, but didn't enable 5-level paging for us. + * + * The same trampoline can be used to switch from 5- to 4-level paging + * mode, like when starting 4-level paging kernel via kexec() when + * original kernel worked in 5-level paging mode. + * + * For the trampoline, we need the top page table to reside in lower + * memory as we don't have a way to load 64-bit values into CR3 in + * 32-bit mode. + * + * We go though the trampoline even if we don't have to: if we're + * already in a desired paging mode. This way the trampoline code gets + * tested on every boot. */ /* Make sure we have GDT with 32-bit code segment */ @@ -336,13 +352,18 @@ ENTRY(startup_64) /* Save the trampoline address in RCX */ movq %rax, %rcx + /* + * Load the address of trampoline_return() into RDI. + * It will be used by the trampoline to return to the main code. + */ + leaq trampoline_return(%rip), %rdi /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ pushq $__KERNEL32_CS - leaq compatible_mode(%rip), %rax + leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax pushq %rax lretq -lvl5: +trampoline_return: /* Restore the stack, the 32-bit trampoline uses its own stack */ leaq boot_stack_end(%rbx), %rsp @@ -492,8 +513,14 @@ relocated: jmp *%rax .code32 +/* + * This is the 32-bit trampoline that will be copied over to low memory. + * + * RDI contains the return address (might be above 4G). + * ECX contains the base address of the trampoline memory. + * Non zero RDX on return means we need to enable 5-level paging. + */ ENTRY(trampoline_32bit_src) -compatible_mode: /* Set up data and stack segments */ movl $__KERNEL_DS, %eax movl %eax, %ds @@ -534,24 +561,34 @@ compatible_mode: 1: movl %eax, %cr4 - /* Calculate address we are running at */ - call 1f -1: popl %edi - subl $1b, %edi + /* Calculate address of paging_enabled() once we are executing in the trampoline */ + leal paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax - /* Prepare stack for far return to Long Mode */ + /* Prepare the stack for far return to Long Mode */ pushl $__KERNEL_CS - leal lvl5(%edi), %eax - push %eax + pushl %eax - /* Enable paging back */ + /* Enable paging again */ movl $(X86_CR0_PG | X86_CR0_PE), %eax movl %eax, %cr0 lret + .code64 +paging_enabled: + /* Return from the trampoline */ + jmp *%rdi + + /* + * The trampoline code has a size limit. + * Make sure we fail to compile if the trampoline code grows + * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes. + */ + .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE + + .code32 no_longmode: - /* This isn't an x86-64 CPU so hang */ + /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: hlt jmp 1b -- cgit v1.2.3-55-g7522 From cb06d8e3d020c30fe10ae711c925a5319ab82c88 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Mon, 5 Mar 2018 19:25:50 +0300 Subject: x86/tme: Detect if TME and MKTME is activated by BIOS IA32_TME_ACTIVATE MSR (0x982) can be used to check if BIOS has enabled TME and MKTME. It includes which encryption policy/algorithm is selected for TME or available for MKTME. For MKTME, the MSR also enumerates how many KeyIDs are available. We would need to exclude KeyID bits from physical address bits. 
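[ Editor's illustration -- not part of the patch: a stand-alone C sketch of the IA32_TME_ACTIVATE bit layout described above. The raw MSR value used here is made up for demonstration; only the field positions match the TME_ACTIVATE_* macros the patch introduces below. ]

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* Hypothetical raw IA32_TME_ACTIVATE (MSR 0x982) value */
		uint64_t tme_activate = 0x0001000200000003ULL;

		int locked      = (int)(tme_activate & 0x1);          /* bit 0 */
		int enabled     = (int)((tme_activate >> 1) & 0x1);   /* bit 1 */
		int policy      = (int)((tme_activate >> 4) & 0xf);   /* bits 7:4 */
		int keyid_bits  = (int)((tme_activate >> 32) & 0xf);  /* bits 35:32 */
		int crypto_algs = (int)((tme_activate >> 48) & 0xffff); /* bits 63:48 */
		int nr_keyids   = (1 << keyid_bits) - 1;

		printf("locked=%d enabled=%d policy=%d\n", locked, enabled, policy);
		printf("keyid_bits=%d -> %d KeyIDs, algs=%#x\n",
		       keyid_bits, nr_keyids, crypto_algs);
		/* E.g., with x86_phys_bits = 46 and keyid_bits = 2, the usable
		   physical address width drops to 44 bits. */
		return 0;
	}

For the value above the sketch prints 2 KeyID bits, i.e. 3 usable KeyIDs, which is exactly the quantity that has to be carved out of the physical address bits.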
detect_tme() would adjust cpuinfo_x86::x86_phys_bits accordingly. We have to do this even if we are not going to use KeyID bits ourself. VM guests still have to know that these bits are not usable for physical address. Signed-off-by: Kirill A. Shutemov Cc: Dave Hansen Cc: Kai Huang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180305162610.37510-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 90 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 4aa9fd379390..b862067bb33c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -510,6 +510,93 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c) } } +#define MSR_IA32_TME_ACTIVATE 0x982 + +/* Helpers to access TME_ACTIVATE MSR */ +#define TME_ACTIVATE_LOCKED(x) (x & 0x1) +#define TME_ACTIVATE_ENABLED(x) (x & 0x2) + +#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ +#define TME_ACTIVATE_POLICY_AES_XTS_128 0 + +#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ + +#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ +#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 + +/* Values for mktme_status (SW only construct) */ +#define MKTME_ENABLED 0 +#define MKTME_DISABLED 1 +#define MKTME_UNINITIALIZED 2 +static int mktme_status = MKTME_UNINITIALIZED; + +static void detect_tme(struct cpuinfo_x86 *c) +{ + u64 tme_activate, tme_policy, tme_crypto_algs; + int keyid_bits = 0, nr_keyids = 0; + static u64 tme_activate_cpu0 = 0; + + rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + + if (mktme_status != MKTME_UNINITIALIZED) { + if (tme_activate != tme_activate_cpu0) { + /* Broken BIOS? */ + pr_err_once("x86/tme: configuation is inconsistent between CPUs\n"); + pr_err_once("x86/tme: MKTME is not usable\n"); + mktme_status = MKTME_DISABLED; + + /* Proceed. We may need to exclude bits from x86_phys_bits. */ + } + } else { + tme_activate_cpu0 = tme_activate; + } + + if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { + pr_info_once("x86/tme: not enabled by BIOS\n"); + mktme_status = MKTME_DISABLED; + return; + } + + if (mktme_status != MKTME_UNINITIALIZED) + goto detect_keyid_bits; + + pr_info("x86/tme: enabled by BIOS\n"); + + tme_policy = TME_ACTIVATE_POLICY(tme_activate); + if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) + pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); + + tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); + if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { + pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", + tme_crypto_algs); + mktme_status = MKTME_DISABLED; + } +detect_keyid_bits: + keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); + nr_keyids = (1UL << keyid_bits) - 1; + if (nr_keyids) { + pr_info_once("x86/mktme: enabled by BIOS\n"); + pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); + } else { + pr_info_once("x86/mktme: disabled by BIOS\n"); + } + + if (mktme_status == MKTME_UNINITIALIZED) { + /* MKTME is usable */ + mktme_status = MKTME_ENABLED; + } + + /* + * Exclude KeyID bits from physical address bits. + * + * We have to do this even if we are not going to use KeyID bits + * ourself. VM guests still have to know that these bits are not usable + * for physical address. 
+ */ + c->x86_phys_bits -= keyid_bits; +} + static void init_intel_energy_perf(struct cpuinfo_x86 *c) { u64 epb; @@ -680,6 +767,9 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); + if (cpu_has(c, X86_FEATURE_TME)) + detect_tme(c); + init_intel_energy_perf(c); init_intel_misc_features(c); -- cgit v1.2.3-55-g7522 From be7825c19b4866ddc7b1431740b69ede2eeb93c1 Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Mon, 5 Mar 2018 19:25:52 +0300 Subject: x86/pconfig: Detect PCONFIG targets Intel PCONFIG targets are enumerated via new CPUID leaf 0x1b. This patch detects all supported targets of PCONFIG and implements helper to check if the target is supported. Signed-off-by: Kirill A. Shutemov Cc: Dave Hansen Cc: Kai Huang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180305162610.37510-5-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/intel_pconfig.h | 15 +++++++ arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/intel_pconfig.c | 82 ++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/intel_pconfig.h create mode 100644 arch/x86/kernel/cpu/intel_pconfig.c (limited to 'arch') diff --git a/arch/x86/include/asm/intel_pconfig.h b/arch/x86/include/asm/intel_pconfig.h new file mode 100644 index 000000000000..fb7a37c3798b --- /dev/null +++ b/arch/x86/include/asm/intel_pconfig.h @@ -0,0 +1,15 @@ +#ifndef _ASM_X86_INTEL_PCONFIG_H +#define _ASM_X86_INTEL_PCONFIG_H + +#include +#include + +enum pconfig_target { + INVALID_TARGET = 0, + MKTME_TARGET = 1, + PCONFIG_TARGET_NR +}; + +int pconfig_target_supported(enum pconfig_target target); + +#endif /* _ASM_X86_INTEL_PCONFIG_H */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 570e8bb1f386..a66229f51b12 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -28,7 +28,7 @@ obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c new file mode 100644 index 000000000000..0771a905b286 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_pconfig.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel PCONFIG instruction support. + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Kirill A. Shutemov + */ + +#include +#include + +#define PCONFIG_CPUID 0x1b + +#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1) + +/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */ +enum { + PCONFIG_CPUID_SUBLEAF_INVALID = 0, + PCONFIG_CPUID_SUBLEAF_TARGETID = 1, +}; + +/* Bitmask of supported targets */ +static u64 targets_supported __read_mostly; + +int pconfig_target_supported(enum pconfig_target target) +{ + /* + * We would need to re-think the implementation once we get > 64 + * PCONFIG targets. Spec allows up to 2^32 targets. 
+ */ + BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64); + + if (WARN_ON_ONCE(target >= 64)) + return 0; + return targets_supported & (1ULL << target); +} + +static int __init intel_pconfig_init(void) +{ + int subleaf; + + if (!boot_cpu_has(X86_FEATURE_PCONFIG)) + return 0; + + /* + * Scan subleafs of PCONFIG CPUID leaf. + * + * Subleafs of the same type need not to be consecutive. + * + * Stop on the first invalid subleaf type. All subleafs after the first + * invalid are invalid too. + */ + for (subleaf = 0; subleaf < INT_MAX; subleaf++) { + struct cpuid_regs regs; + + cpuid_count(PCONFIG_CPUID, subleaf, + ®s.eax, ®s.ebx, ®s.ecx, ®s.edx); + + switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) { + case PCONFIG_CPUID_SUBLEAF_INVALID: + /* Stop on the first invalid subleaf */ + goto out; + case PCONFIG_CPUID_SUBLEAF_TARGETID: + /* Mark supported PCONFIG targets */ + if (regs.ebx < 64) + targets_supported |= (1ULL << regs.ebx); + if (regs.ecx < 64) + targets_supported |= (1ULL << regs.ecx); + if (regs.edx < 64) + targets_supported |= (1ULL << regs.edx); + break; + default: + /* Unknown CPUID.PCONFIG subleaf: ignore */ + break; + } + } +out: + return 0; +} +arch_initcall(intel_pconfig_init); -- cgit v1.2.3-55-g7522 From 24c517856af6511be1339dd55edd131160e37aac Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Mon, 5 Mar 2018 19:25:53 +0300 Subject: x86/pconfig: Provide defines and helper to run MKTME_KEY_PROG leaf MKTME_KEY_PROG allows to manipulate MKTME keys in the CPU. Signed-off-by: Kirill A. Shutemov Cc: Dave Hansen Cc: Kai Huang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180305162610.37510-6-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/intel_pconfig.h | 50 ++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/intel_pconfig.h b/arch/x86/include/asm/intel_pconfig.h index fb7a37c3798b..3cb002b1d0f9 100644 --- a/arch/x86/include/asm/intel_pconfig.h +++ b/arch/x86/include/asm/intel_pconfig.h @@ -12,4 +12,54 @@ enum pconfig_target { int pconfig_target_supported(enum pconfig_target target); +enum pconfig_leaf { + MKTME_KEY_PROGRAM = 0, + PCONFIG_LEAF_INVALID, +}; + +#define PCONFIG ".byte 0x0f, 0x01, 0xc5" + +/* Defines and structure for MKTME_KEY_PROGRAM of PCONFIG instruction */ + +/* mktme_key_program::keyid_ctrl COMMAND, bits [7:0] */ +#define MKTME_KEYID_SET_KEY_DIRECT 0 +#define MKTME_KEYID_SET_KEY_RANDOM 1 +#define MKTME_KEYID_CLEAR_KEY 2 +#define MKTME_KEYID_NO_ENCRYPT 3 + +/* mktme_key_program::keyid_ctrl ENC_ALG, bits [23:8] */ +#define MKTME_AES_XTS_128 (1 << 8) + +/* Return codes from the PCONFIG MKTME_KEY_PROGRAM */ +#define MKTME_PROG_SUCCESS 0 +#define MKTME_INVALID_PROG_CMD 1 +#define MKTME_ENTROPY_ERROR 2 +#define MKTME_INVALID_KEYID 3 +#define MKTME_INVALID_ENC_ALG 4 +#define MKTME_DEVICE_BUSY 5 + +/* Hardware requires the structure to be 256 byte alinged. Otherwise #GP(0). 
*/ +struct mktme_key_program { + u16 keyid; + u32 keyid_ctrl; + u8 __rsvd[58]; + u8 key_field_1[64]; + u8 key_field_2[64]; +} __packed __aligned(256); + +static inline int mktme_key_program(struct mktme_key_program *key_program) +{ + unsigned long rax = MKTME_KEY_PROGRAM; + + if (!pconfig_target_supported(MKTME_TARGET)) + return -ENXIO; + + asm volatile(PCONFIG + : "=a" (rax), "=b" (key_program) + : "0" (rax), "1" (key_program) + : "memory", "cc"); + + return rax; +} + #endif /* _ASM_X86_INTEL_PCONFIG_H */ -- cgit v1.2.3-55-g7522 From 50beba07a0e42ebd4454adc97515a2a2a969645b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 20 Feb 2018 20:05:04 +0200 Subject: ACPI, x86/boot: Split out acpi_generic_reduce_hw_init() and export This is a preparation patch to allow override the hardware reduced initialization on ACPI enabled platforms. No functional change intended. Signed-off-by: Andy Shevchenko Reviewed-by: Rafael J. Wysocki Cc: Eric Biederman Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: linux-acpi@vger.kernel.org Link: http://lkml.kernel.org/r/20180220180506.65523-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/acpi.h | 4 ++++ arch/x86/kernel/acpi/boot.c | 22 +++++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 6609dd7289b5..a303d7b7d763 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -140,6 +140,8 @@ static inline u64 acpi_arch_get_root_pointer(void) return x86_init.acpi.get_root_pointer(); } +void acpi_generic_reduced_hw_init(void); + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 @@ -149,6 +151,8 @@ static inline void acpi_noirq_set(void) { } static inline void acpi_disable_pci(void) { } static inline void disable_acpi(void) { } +static inline void acpi_generic_reduced_hw_init(void) { } + #endif /* !CONFIG_ACPI */ #define ARCH_HAS_POWER_INIT 1 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2aa92094b59d..baa084ecffdb 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1376,17 +1376,21 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) * * We initialize the Hardware-reduced ACPI model here: */ +void __init acpi_generic_reduced_hw_init(void) +{ + /* + * Override x86_init functions and bypass legacy PIC in + * hardware reduced ACPI mode. + */ + x86_init.timers.timer_init = x86_init_noop; + x86_init.irqs.pre_vector_init = x86_init_noop; + legacy_pic = &null_legacy_pic; +} + static void __init acpi_reduced_hw_init(void) { - if (acpi_gbl_reduced_hardware) { - /* - * Override x86_init functions and bypass legacy pic - * in Hardware-reduced ACPI mode - */ - x86_init.timers.timer_init = x86_init_noop; - x86_init.irqs.pre_vector_init = x86_init_noop; - legacy_pic = &null_legacy_pic; - } + if (acpi_gbl_reduced_hardware) + acpi_generic_reduced_hw_init(); } /* -- cgit v1.2.3-55-g7522 From 81b53e5ff21e09b42525cfa08f2b0af2b8c5f465 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 20 Feb 2018 20:05:05 +0200 Subject: ACPI, x86/boot: Introduce the ->reduced_hw_early_init() ACPI callback Some ACPI hardware reduced platforms need to initialize certain devices defined by the ACPI hardware specification even though in principle those devices should not be present in an ACPI hardware reduced platform. 
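[ Editor's sketch -- hypothetical platform code, not part of the patch: one way a platform could hook the callback this pair of patches introduces. The "my_platform" names are invented; x86_init, x86_init_noop, legacy_pic and null_legacy_pic are the existing kernel symbols used elsewhere in this series, and the reduced_hw_early_init field only exists once the second patch below is applied. ]

	#include <linux/init.h>
	#include <asm/x86_init.h>
	#include <asm/i8259.h>

	/* Keep the platform's timers running, but still bypass the legacy PIC. */
	static void __init my_platform_reduced_hw_init(void)
	{
		x86_init.irqs.pre_vector_init = x86_init_noop;
		legacy_pic = &null_legacy_pic;
	}

	void __init my_platform_early_setup(void)
	{
		/* Installed early; acpi_reduced_hw_init() invokes it later,
		   instead of the generic acpi_generic_reduced_hw_init(). */
		x86_init.acpi.reduced_hw_early_init = my_platform_reduced_hw_init;
	}

This mirrors the design choice of the series: the generic behavior stays the default, and platforms opt out by replacing a single function pointer rather than patching the generic code.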
To allow that to happen, make it possible to override the generic x86_init callbacks and provide a custom legacy_pic value, add a new ->reduced_hw_early_init() callback to struct x86_init_acpi and make acpi_reduced_hw_init() use it. Signed-off-by: Andy Shevchenko Reviewed-by: Rafael J. Wysocki Cc: Eric Biederman Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: linux-acpi@vger.kernel.org Link: http://lkml.kernel.org/r/20180220180506.65523-2-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/x86_init.c | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 2e2c34d2bb00..5bd45a8f5ae3 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -133,9 +133,11 @@ struct x86_hyper_init { /** * struct x86_init_acpi - x86 ACPI init functions * @get_root_pointer: get RSDP address + * @reduced_hw_early_init: hardware reduced platform early init */ struct x86_init_acpi { u64 (*get_root_pointer)(void); + void (*reduced_hw_early_init)(void); }; /** diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index baa084ecffdb..7a37d9357bc4 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1390,7 +1390,7 @@ void __init acpi_generic_reduced_hw_init(void) static void __init acpi_reduced_hw_init(void) { if (acpi_gbl_reduced_hardware) - acpi_generic_reduced_hw_init(); + x86_init.acpi.reduced_hw_early_init(); } /* diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index b8cff22a8785..ac67ccffeef0 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -95,6 +96,7 @@ struct x86_init_ops x86_init __initdata = { .acpi = { .get_root_pointer = u64_x86_init_noop, + .reduced_hw_early_init = acpi_generic_reduced_hw_init, }, }; -- cgit v1.2.3-55-g7522 From 02428742639bc3300c8c527b054d0ec0bdf5571d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 20 Feb 2018 20:05:06 +0200 Subject: x86/platform/intel-mid: Add special handling for ACPI HW reduced platforms When switching to ACPI HW reduced platforms we still want to initialize timers. Override x86_init.acpi.reduced_hw_init to achieve that. Signed-off-by: Andy Shevchenko Reviewed-by: Rafael J. Wysocki Cc: Eric Biederman Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: linux-acpi@vger.kernel.org Link: http://lkml.kernel.org/r/20180220180506.65523-3-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/intel-mid.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 2c67bae6bb53..c556f1e8936e 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -199,6 +199,12 @@ void __init x86_intel_mid_early_setup(void) legacy_pic = &null_legacy_pic; + /* + * Do nothing for now as everything needed done in + * x86_intel_mid_early_setup() below. 
+ */ + x86_init.acpi.reduced_hw_early_init = x86_init_noop; + pm_power_off = intel_mid_power_off; machine_ops.emergency_restart = intel_mid_reboot; -- cgit v1.2.3-55-g7522 From 565977a3d929fc4427769117a8ac976ec16776d5 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 14 Mar 2018 14:59:32 -0600 Subject: x86/mm: Remove pointless checks in vmalloc_fault vmalloc_fault() sets the user's pgd or p4d from the kernel page table. Once it's set, all tables underneath are identical. There is no point in following the same page table with two separate pointers and making sure they see the same thing with BUG(). Remove the pointless checks in vmalloc_fault(). Also rename the kernel pgd/p4d pointers to pgd_k/p4d_k so that their names are consistent in the file. Suggested-by: Andy Lutomirski Signed-off-by: Toshi Kani Signed-off-by: Thomas Gleixner Cc: linux-mm@kvack.org Cc: Borislav Petkov Cc: Gratian Crisan Link: https://lkml.kernel.org/r/20180314205932.7193-1-toshi.kani@hpe.com --- arch/x86/mm/fault.c | 56 ++++++++++++++++------------------------------------- 1 file changed, 17 insertions(+), 39 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 84d702a71afe..70c3b1c43676 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -417,11 +417,11 @@ void vmalloc_sync_all(void) */ static noinline int vmalloc_fault(unsigned long address) { - pgd_t *pgd, *pgd_ref; - p4d_t *p4d, *p4d_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; + pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) @@ -435,73 +435,51 @@ static noinline int vmalloc_fault(unsigned long address) * case just flush: */ pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) + pgd_k = pgd_offset_k(address); + if (pgd_none(*pgd_k)) return -1; if (pgtable_l5_enabled) { if (pgd_none(*pgd)) { - set_pgd(pgd, *pgd_ref); + set_pgd(pgd, *pgd_k); arch_flush_lazy_mmu_mode(); } else { - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k)); } } /* With 4-level paging, copying happens on the p4d level.
*/ p4d = p4d_offset(pgd, address); - p4d_ref = p4d_offset(pgd_ref, address); - if (p4d_none(*p4d_ref)) + p4d_k = p4d_offset(pgd_k, address); + if (p4d_none(*p4d_k)) return -1; if (p4d_none(*p4d) && !pgtable_l5_enabled) { - set_p4d(p4d, *p4d_ref); + set_p4d(p4d, *p4d_k); arch_flush_lazy_mmu_mode(); } else { - BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref)); + BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k)); } - /* - * Below here mismatches are bugs because these lower tables - * are shared: - */ BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); pud = pud_offset(p4d, address); - pud_ref = pud_offset(p4d_ref, address); - if (pud_none(*pud_ref)) + if (pud_none(*pud)) return -1; - if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref)) - BUG(); - if (pud_large(*pud)) return 0; pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) + if (pmd_none(*pmd)) return -1; - if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref)) - BUG(); - if (pmd_large(*pmd)) return 0; - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - pte = pte_offset_kernel(pmd, address); - - /* - * Don't use pte_page here, because the mappings can point - * outside mem_map, and the NUMA hash lookup cannot handle - * that: - */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); + if (!pte_present(*pte)) + return -1; return 0; } -- cgit v1.2.3-55-g7522 From fc5d1073cae299de4517755a910df4f12a6a438f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 26 Mar 2018 23:27:21 -0700 Subject: x86/mm/32: Remove unused node_memmap_size_bytes() & CONFIG_NEED_NODE_MEMMAP_SIZE logic node_memmap_size_bytes() has been unused since the v3.9 kernel, so remove it. Signed-off-by: David Rientjes Cc: Dave Hansen Cc: Laura Abbott Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Fixes: f03574f2d5b2 ("x86-32, mm: Rip out x86_32 NUMA remapping code") Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1803262325540.256524@chino.kir.corp.google.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ---- arch/x86/mm/numa_32.c | 11 ----------- include/linux/mmzone.h | 5 ----- mm/sparse.c | 22 ---------------------- 4 files changed, 42 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 18233e459bff..739aff253d17 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1608,10 +1608,6 @@ config ARCH_HAVE_MEMORY_PRESENT def_bool y depends on X86_32 && DISCONTIGMEM -config NEED_NODE_MEMMAP_SIZE - def_bool y - depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) - config ARCH_FLATMEM_ENABLE def_bool y depends on X86_32 && !NUMA diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index aca6295350f3..e8a4a09e20f1 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -60,17 +60,6 @@ void memory_present(int nid, unsigned long start, unsigned long end) } printk(KERN_CONT "\n"); } - -unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long nr_pages = end_pfn - start_pfn; - - if (!nr_pages) - return 0; - - return (nr_pages + 1) * sizeof(struct page); -} #endif extern unsigned long highend_pfn, highstart_pfn; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7522a6987595..a2db4576e499 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -816,10 +816,6 @@ int local_memory_node(int node_id); static inline int local_memory_node(int node_id) { return node_id; }; #endif -#ifdef CONFIG_NEED_NODE_MEMMAP_SIZE -unsigned 
From 547edaca247abf910e32f0cd883ba83b8fc6d0ed Mon Sep 17 00:00:00 2001
From: Kirill A. Shutemov
Date: Thu, 15 Mar 2018 16:49:06 +0300
Subject: x86/mm: Update comment in detect_tme() regarding x86_phys_bits

As Kai pointed out, the primary reason for adjusting x86_phys_bits is to
reflect that the address space is reduced, and not the ability to
communicate the available physical address space to virtual machines.

Suggested-by: Kai Huang
Signed-off-by: Kirill A. Shutemov
Signed-off-by: Thomas Gleixner
Cc: Tom Lendacky
Cc: Dave Hansen
Cc: linux-mm@kvack.org
Link: https://lkml.kernel.org/r/20180315134907.9311-2-kirill.shutemov@linux.intel.com
---
 arch/x86/kernel/cpu/intel.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 3f8d7a3b6447..6106d11ceb6b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -587,11 +587,8 @@ detect_keyid_bits:
 	}
 
 	/*
-	 * Exclude KeyID bits from physical address bits.
-	 *
-	 * We have to do this even if we are not going to use KeyID bits
-	 * ourself. VM guests still have to know that these bits are not usable
-	 * for physical address.
+	 * KeyID bits effectively lower the number of physical address
+	 * bits. Update cpuinfo_x86::x86_phys_bits accordingly.
 	 */
 	c->x86_phys_bits -= keyid_bits;
 }
-- cgit v1.2.3-55-g7522
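A quick standalone illustration of the adjustment described above, using made-up example values for the CPUID-reported width and the MKTME KeyID bits:

/*
 * Sketch: KeyID bits are carved out of the top of the physical
 * address, so the usable physical address width shrinks by that
 * many bits. Values are illustrative, not read from hardware.
 */
#include <stdio.h>

int main(void)
{
	int x86_phys_bits = 46;  /* example CPUID-reported physical bits */
	int keyid_bits    = 6;   /* example bits reserved for MKTME KeyIDs */

	x86_phys_bits -= keyid_bits;
	printf("usable physical bits: %d, highest address: %#llx\n",
	       x86_phys_bits, (1ULL << x86_phys_bits) - 1);
	return 0;
}
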
From 07344b15a994acadbe439aa4e75127ed1ccca099 Mon Sep 17 00:00:00 2001
From: Tom Lendacky
Date: Tue, 27 Mar 2018 17:07:11 -0500
Subject: x86/boot: Fix SEV boot failure from change to __PHYSICAL_MASK_SHIFT

In arch/x86/boot/compressed/kaslr_64.c, CONFIG_AMD_MEM_ENCRYPT support
was initially #undef'd to support SME with minimal effort. When support
for SEV was added, the #undef remained and some minimal support for
setting the encryption bit was added for building identity mapped
pagetable entries.

Commit b83ce5ee9147 ("x86/mm/64: Make __PHYSICAL_MASK_SHIFT always 52")
changed __PHYSICAL_MASK_SHIFT from 46 to 52 in support of 5-level
paging. This change resulted in SEV guests failing to boot because the
encryption bit was no longer being automatically masked out.

The compressed boot path now requires sme_me_mask to be defined in
order for the pagetable functions, such as pud_present(), to properly
mask out the encryption bit (currently bit 47) when evaluating
pagetable entries.

Add an sme_me_mask variable in arch/x86/boot/compressed/mem_encrypt.S,
which is set when SEV is active, delete the #undef CONFIG_AMD_MEM_ENCRYPT
from arch/x86/boot/compressed/kaslr_64.c, and use sme_me_mask when
building the identity mapped pagetable entries.

Fixes: b83ce5ee9147 ("x86/mm/64: Make __PHYSICAL_MASK_SHIFT always 52")
Signed-off-by: Tom Lendacky
Signed-off-by: Thomas Gleixner
Cc: Borislav Petkov
Cc: Brijesh Singh
Cc: "Kirill A. Shutemov"
Cc: "H. Peter Anvin"
Link: https://lkml.kernel.org/r/20180327220711.8702.55842.stgit@tlendack-t1.amdoffice.net
---
 arch/x86/boot/compressed/kaslr_64.c    | 14 ++++----------
 arch/x86/boot/compressed/mem_encrypt.S | 17 ++++++++++-------
 arch/x86/boot/compressed/misc.h        |  2 +-
 3 files changed, 15 insertions(+), 18 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c
index b5e5e02f8cde..522d11431433 100644
--- a/arch/x86/boot/compressed/kaslr_64.c
+++ b/arch/x86/boot/compressed/kaslr_64.c
@@ -16,13 +16,6 @@
 #define __pa(x)  ((unsigned long)(x))
 #define __va(x)  ((void *)((unsigned long)(x)))
 
-/*
- * The pgtable.h and mm/ident_map.c includes make use of the SME related
- * information which is not used in the compressed image support. Un-define
- * the SME support to avoid any compile and link errors.
- */
-#undef CONFIG_AMD_MEM_ENCRYPT
-
 /* No PAGE_TABLE_ISOLATION support needed either: */
 #undef CONFIG_PAGE_TABLE_ISOLATION
 
@@ -85,13 +78,14 @@ static struct x86_mapping_info mapping_info;
 /* Locates and clears a region for a new top level page table. */
 void initialize_identity_maps(void)
 {
-	unsigned long sev_me_mask = get_sev_encryption_mask();
+	/* If running as an SEV guest, the encryption mask is required. */
+	set_sev_encryption_mask();
 
 	/* Init mapping_info with run-time function/buffer pointers. */
 	mapping_info.alloc_pgt_page = alloc_pgt_page;
 	mapping_info.context = &pgt_data;
-	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask;
-	mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask;
+	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
+	mapping_info.kernpg_flag = _KERNPG_TABLE;
 
 	/*
 	 * It should be impossible for this not to already be true,

diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index 54f5f6625a73..eaa843a52907 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -88,9 +88,7 @@ ENTRY(get_sev_encryption_bit)
 ENDPROC(get_sev_encryption_bit)
 
 	.code64
-ENTRY(get_sev_encryption_mask)
-	xor	%rax, %rax
-
+ENTRY(set_sev_encryption_mask)
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 	push	%rbp
 	push	%rdx
@@ -101,9 +99,7 @@ ENTRY(get_sev_encryption_mask)
 	testl	%eax, %eax
 	jz	.Lno_sev_mask
 
-	xor	%rdx, %rdx
-	bts	%rax, %rdx		/* Create the encryption mask */
-	mov	%rdx, %rax		/* ... and return it */
+	bts	%rax, sme_me_mask(%rip)	/* Create the encryption mask */
 
 .Lno_sev_mask:
 	movq	%rbp, %rsp	/* Restore original stack pointer */
@@ -112,9 +108,16 @@ ENTRY(get_sev_encryption_mask)
 	pop	%rbp
 #endif
 
+	xor	%rax, %rax
 	ret
-ENDPROC(get_sev_encryption_mask)
+ENDPROC(set_sev_encryption_mask)
 
 	.data
 enc_bit:
 	.int	0xffffffff
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+	.balign	8
+GLOBAL(sme_me_mask)
+	.quad	0
+#endif

diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 4d369c308ed7..9e11be4cae19 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -114,6 +114,6 @@ static inline void console_init(void)
 { }
 #endif
 
-unsigned long get_sev_encryption_mask(void);
+void set_sev_encryption_mask(void);
 
 #endif
-- cgit v1.2.3-55-g7522
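A standalone sketch of the failure mode this patch addresses: once __PHYSICAL_MASK covers 52 bits, the encryption bit (bit 47, as the message notes) survives masking unless sme_me_mask is stripped explicitly. All values below are illustrative:

/* Sketch: why a 52-bit physical mask stops hiding the SEV C-bit. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sme_me_mask = 1ULL << 47;
	uint64_t entry = 0x1234000ULL | 0x63 | sme_me_mask; /* addr | flags | C-bit */
	uint64_t mask_46 = (1ULL << 46) - 1;  /* old __PHYSICAL_MASK: strips bit 47 */
	uint64_t mask_52 = (1ULL << 52) - 1;  /* new __PHYSICAL_MASK: keeps bit 47  */

	printf("addr via 46-bit mask: %#llx\n",
	       (unsigned long long)(entry & mask_46 & ~0xfffULL));
	printf("addr via 52-bit mask: %#llx (bogus: C-bit still set)\n",
	       (unsigned long long)(entry & mask_52 & ~0xfffULL));
	printf("fixed with ~sme_me_mask: %#llx\n",
	       (unsigned long long)(entry & mask_52 & ~sme_me_mask & ~0xfffULL));
	return 0;
}
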
From eaeb8e76cd5751e805f6e4a3fcec91d283e3b0c2 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Tue, 13 Mar 2018 15:47:09 +0000
Subject: x86/cpu/tme: Fix spelling: "configuation" -> "configuration"

Trivial fix to a spelling mistake in the pr_err_once() error message text.

Signed-off-by: Colin Ian King
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: kernel-janitors@vger.kernel.org
Link: http://lkml.kernel.org/r/20180313154709.1015-1-colin.king@canonical.com
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/intel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 6106d11ceb6b..b9693b80fc21 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -540,7 +540,7 @@ static void detect_tme(struct cpuinfo_x86 *c)
 	if (mktme_status != MKTME_UNINITIALIZED) {
 		if (tme_activate != tme_activate_cpu0) {
 			/* Broken BIOS? */
-			pr_err_once("x86/tme: configuation is inconsistent between CPUs\n");
+			pr_err_once("x86/tme: configuration is inconsistent between CPUs\n");
 			pr_err_once("x86/tme: MKTME is not usable\n");
 
 			mktme_status = MKTME_DISABLED;
-- cgit v1.2.3-55-g7522