diff options
Diffstat (limited to 'arch/x86/mm')
43 files changed, 1802 insertions, 269 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 0fbdcb64f9f8..7ba7f3d7f477 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,13 @@ -# Kernel does not boot with instrumentation of tlb.c. -KCOV_INSTRUMENT_tlb.o := n +# SPDX-License-Identifier: GPL-2.0 +# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c +KCOV_INSTRUMENT_tlb.o := n +KCOV_INSTRUMENT_mem_encrypt.o := n + +KASAN_SANITIZE_mem_encrypt.o := n + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_mem_encrypt.o = -pg +endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o physaddr.o setup_nx.o tlb.o @@ -39,3 +47,5 @@ obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index 91f501b2da3b..048c761d97b0 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * AMD NUMA support. * Discover the memory map and associated nodes. diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0470826d2bdc..5e3ac6fe6c9e 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -13,12 +13,12 @@ */ #include <linux/debugfs.h> +#include <linux/kasan.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/seq_file.h> -#include <asm/kasan.h> #include <asm/pgtable.h> /* @@ -138,7 +138,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) { pgprotval_t pr = pgprot_val(prot); static const char * const level_name[] = - { "cr3", "pgd", "pud", "pmd", "pte" }; + { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; if (!pgprot_val(prot)) { /* Not present */ @@ -162,12 +162,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) pt_dump_cont_printf(m, dmsg, " "); /* Bit 7 has a different meaning on level 3 vs 4 */ - if (level <= 3 && pr & _PAGE_PSE) + if (level <= 4 && pr & _PAGE_PSE) pt_dump_cont_printf(m, dmsg, "PSE "); else pt_dump_cont_printf(m, dmsg, " "); - if ((level == 4 && pr & _PAGE_PAT) || - ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) + if ((level == 5 && pr & _PAGE_PAT) || + ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE)) pt_dump_cont_printf(m, dmsg, "PAT "); else pt_dump_cont_printf(m, dmsg, " "); @@ -188,11 +188,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) */ static unsigned long normalize_addr(unsigned long u) { -#ifdef CONFIG_X86_64 - return (signed long)(u << 16) >> 16; -#else - return u; -#endif + int shift; + if (!IS_ENABLED(CONFIG_X86_64)) + return u; + + shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); + return (signed long)(u << shift) >> shift; } /* @@ -297,32 +298,62 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, for (i = 0; i < PTRS_PER_PTE; i++) { prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); - note_page(m, st, __pgprot(prot), 4); + note_page(m, st, __pgprot(prot), 5); start++; } } +#ifdef CONFIG_KASAN + +/* + * This is an optimization for KASAN=y case. Since all kasan page tables + * eventually point to the kasan_zero_page we could call note_page() + * right away without walking through lower level page tables. This saves + * us dozens of seconds (minutes for 5-level config) while checking for + * W+X mapping or reading kernel_page_tables debugfs file. + */ +static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, + void *pt) +{ + if (__pa(pt) == __pa(kasan_zero_pmd) || +#ifdef CONFIG_X86_5LEVEL + __pa(pt) == __pa(kasan_zero_p4d) || +#endif + __pa(pt) == __pa(kasan_zero_pud)) { + pgprotval_t prot = pte_flags(kasan_zero_pte[0]); + note_page(m, st, __pgprot(prot), 5); + return true; + } + return false; +} +#else +static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, + void *pt) +{ + return false; +} +#endif #if PTRS_PER_PMD > 1 static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) { int i; - pmd_t *start; + pmd_t *start, *pmd_start; pgprotval_t prot; - start = (pmd_t *)pud_page_vaddr(addr); + pmd_start = start = (pmd_t *)pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { if (pmd_large(*start) || !pmd_present(*start)) { prot = pmd_flags(*start); - note_page(m, st, __pgprot(prot), 3); - } else { + note_page(m, st, __pgprot(prot), 4); + } else if (!kasan_page_table(m, st, pmd_start)) { walk_pte_level(m, st, *start, P + i * PMD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 3); + note_page(m, st, __pgprot(0), 4); start++; } } @@ -335,39 +366,27 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, #if PTRS_PER_PUD > 1 -/* - * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y - * KASAN fills page tables with the same values. Since there is no - * point in checking page table more than once we just skip repeated - * entries. This saves us dozens of seconds during boot. - */ -static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx) -{ - return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); -} - static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) { int i; - pud_t *start; + pud_t *start, *pud_start; pgprotval_t prot; pud_t *prev_pud = NULL; - start = (pud_t *)p4d_page_vaddr(addr); + pud_start = start = (pud_t *)p4d_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); - if (!pud_none(*start) && - !pud_already_checked(prev_pud, start, st->check_wx)) { + if (!pud_none(*start)) { if (pud_large(*start) || !pud_present(*start)) { prot = pud_flags(*start); - note_page(m, st, __pgprot(prot), 2); - } else { + note_page(m, st, __pgprot(prot), 3); + } else if (!kasan_page_table(m, st, pud_start)) { walk_pmd_level(m, st, *start, P + i * PUD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 2); + note_page(m, st, __pgprot(0), 3); prev_pud = start; start++; @@ -385,10 +404,10 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) { int i; - p4d_t *start; + p4d_t *start, *p4d_start; pgprotval_t prot; - start = (p4d_t *)pgd_page_vaddr(addr); + p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_P4D; i++) { st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); @@ -396,7 +415,7 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, if (p4d_large(*start) || !p4d_present(*start)) { prot = p4d_flags(*start); note_page(m, st, __pgprot(prot), 2); - } else { + } else if (!kasan_page_table(m, st, p4d_start)) { walk_pud_level(m, st, *start, P + i * P4D_LEVEL_MULT); } diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 0ea8afcb929c..c3521e2be396 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -2,6 +2,7 @@ #include <linux/uaccess.h> #include <linux/sched/debug.h> +#include <asm/fpu/internal.h> #include <asm/traps.h> #include <asm/kdebug.h> @@ -36,6 +37,71 @@ bool ex_handler_fault(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_fault); +/* + * Handler for UD0 exception following a failed test against the + * result of a refcount inc/dec/add/sub. + */ +bool ex_handler_refcount(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* First unconditionally saturate the refcount. */ + *(int *)regs->cx = INT_MIN / 2; + + /* + * Strictly speaking, this reports the fixup destination, not + * the fault location, and not the actually overflowing + * instruction, which is the instruction before the "js", but + * since that instruction could be a variety of lengths, just + * report the location after the overflow, which should be close + * enough for finding the overflow, as it's at least back in + * the function, having returned from .text.unlikely. + */ + regs->ip = ex_fixup_addr(fixup); + + /* + * This function has been called because either a negative refcount + * value was seen by any of the refcount functions, or a zero + * refcount value was seen by refcount_dec(). + * + * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result + * wrapped around) will be set. Additionally, seeing the refcount + * reach 0 will set ZF (Zero Flag: result was zero). In each of + * these cases we want a report, since it's a boundary condition. + * + */ + if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { + bool zero = regs->flags & X86_EFLAGS_ZF; + + refcount_error_report(regs, zero ? "hit zero" : "overflow"); + } + + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_refcount); + +/* + * Handler for when we fail to restore a task's FPU state. We should never get + * here because the FPU state of a task using the FPU (task->thread.fpu.state) + * should always be valid. However, past bugs have allowed userspace to set + * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn(). + * These caused XRSTOR to fail when switching to the task, leaking the FPU + * registers of the task previously executing on the CPU. Mitigate this class + * of vulnerability by restoring from the initial state (essentially, zeroing + * out all the FPU registers) if we can't restore from the task's FPU state. + */ +bool ex_handler_fprestore(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + + WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", + (void *)instruction_pointer(regs)); + + __copy_kernel_to_fpregs(&init_fpstate, -1); + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_fprestore); + bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { @@ -142,7 +208,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) * undefined. I'm not sure which CPUs do this, but at least * the 486 DX works this way. */ - if ((regs->cs & 0xFFFF) != __KERNEL_CS) + if (regs->cs != __KERNEL_CS) goto fail; /* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2a1fa10c6a98..b0ff378650a9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. @@ -192,8 +193,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ -static void fill_sig_info_pkey(int si_code, siginfo_t *info, - struct vm_area_struct *vma) +static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) { /* This is effectively an #ifdef */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) @@ -209,7 +209,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * valid VMA, so we should never reach this without a * valid VMA. */ - if (!vma) { + if (!pkey) { WARN_ONCE(1, "PKU fault with no VMA passed in"); info->si_pkey = 0; return; @@ -219,13 +219,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * absolutely guranteed to be 100% accurate because of * the race explained above. */ - info->si_pkey = vma_pkey(vma); + info->si_pkey = *pkey; } static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk, struct vm_area_struct *vma, - int fault) + struct task_struct *tsk, u32 *pkey, int fault) { unsigned lsb = 0; siginfo_t info; @@ -240,7 +239,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, lsb = PAGE_SHIFT; info.si_addr_lsb = lsb; - fill_sig_info_pkey(si_code, &info, vma); + fill_sig_info_pkey(si_code, &info, pkey); force_sig_info(si_signo, &info, tsk); } @@ -396,14 +395,18 @@ static void dump_pagetable(unsigned long address) pte_t *pte; #ifdef CONFIG_X86_PAE - printk("*pdpt = %016Lx ", pgd_val(*pgd)); + pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; +#define pr_pde pr_cont +#else +#define pr_pde pr_info #endif p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); - printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); + pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); +#undef pr_pde /* * We must not directly access the pte in the highpte @@ -415,9 +418,9 @@ static void dump_pagetable(unsigned long address) goto out; pte = pte_offset_kernel(pmd, address); - printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); + pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); out: - printk("\n"); + pr_cont("\n"); } #else /* CONFIG_X86_64: */ @@ -565,7 +568,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pgd)) goto bad; - printk("PGD %lx ", pgd_val(*pgd)); + pr_info("PGD %lx ", pgd_val(*pgd)); if (!pgd_present(*pgd)) goto out; @@ -574,7 +577,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(p4d)) goto bad; - printk("P4D %lx ", p4d_val(*p4d)); + pr_cont("P4D %lx ", p4d_val(*p4d)); if (!p4d_present(*p4d) || p4d_large(*p4d)) goto out; @@ -582,7 +585,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pud)) goto bad; - printk("PUD %lx ", pud_val(*pud)); + pr_cont("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) goto out; @@ -590,7 +593,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pmd)) goto bad; - printk("PMD %lx ", pmd_val(*pmd)); + pr_cont("PMD %lx ", pmd_val(*pmd)); if (!pmd_present(*pmd) || pmd_large(*pmd)) goto out; @@ -598,12 +601,12 @@ static void dump_pagetable(unsigned long address) if (bad_address(pte)) goto bad; - printk("PTE %lx", pte_val(*pte)); + pr_cont("PTE %lx", pte_val(*pte)); out: - printk("\n"); + pr_cont("\n"); return; bad: - printk("BAD\n"); + pr_info("BAD\n"); } #endif /* CONFIG_X86_64 */ @@ -758,8 +761,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; unsigned long flags; int sig; - /* No context means no VMA to pass down */ - struct vm_area_struct *vma = NULL; /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF)) { @@ -784,7 +785,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* XXX: hwpoison faults will set the wrong code. */ force_sig_info_fault(signal, si_code, address, - tsk, vma, 0); + tsk, NULL, 0); } /* @@ -802,7 +803,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (is_vmalloc_addr((void *)address) && (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { - register void *__sp asm("rsp"); unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); /* * We're likely to be running with very little stack space @@ -817,7 +817,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, asm volatile ("movq %[stack], %%rsp\n\t" "call handle_stack_overflow\n\t" "1: jmp 1b" - : "+r" (__sp) + : ASM_CALL_CONSTRAINT : "D" ("kernel stack overflow (page fault)"), "S" (regs), "d" (address), [stack] "rm" (stack)); @@ -893,8 +893,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - int si_code) + unsigned long address, u32 *pkey, int si_code) { struct task_struct *tsk = current; @@ -942,7 +941,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_PF; - force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0); return; } @@ -955,9 +954,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma) + unsigned long address, u32 *pkey) { - __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR); + __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR); } static void @@ -965,6 +964,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct vm_area_struct *vma, int si_code) { struct mm_struct *mm = current->mm; + u32 pkey; + + if (vma) + pkey = vma_pkey(vma); /* * Something tried to access memory that isn't in our memory map.. @@ -972,7 +975,8 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, */ up_read(&mm->mmap_sem); - __bad_area_nosemaphore(regs, error_code, address, vma, si_code); + __bad_area_nosemaphore(regs, error_code, address, + (vma) ? &pkey : NULL, si_code); } static noinline void @@ -1015,7 +1019,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, - struct vm_area_struct *vma, unsigned int fault) + u32 *pkey, unsigned int fault) { struct task_struct *tsk = current; int code = BUS_ADRERR; @@ -1042,13 +1046,12 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault); + force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault); } static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - unsigned int fault) + unsigned long address, u32 *pkey, unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -1072,9 +1075,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, vma, fault); + do_sigbus(regs, error_code, address, pkey, fault); else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address, vma); + bad_area_nosemaphore(regs, error_code, address, pkey); else BUG(); } @@ -1254,10 +1257,6 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. - * - * This function must have noinline because both callers - * {,trace_}do_page_fault() have notrace on. Having this an actual function - * guarantees there's a function trace entry. */ static noinline void __do_page_fault(struct pt_regs *regs, unsigned long error_code, @@ -1268,6 +1267,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct mm_struct *mm; int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + u32 pkey; tsk = current; mm = tsk->mm; @@ -1441,7 +1441,17 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. + * + * Note that handle_userfault() may also release and reacquire mmap_sem + * (and not return with VM_FAULT_RETRY), when returning to userland to + * repeat the page fault later with a VM_FAULT_NOPAGE retval + * (potentially after handling any pending signal during the return to + * userland). The return to userland is identified whenever + * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. + * Thus we have to be careful about not touching vma after handling the + * fault, so we read the pkey beforehand. */ + pkey = vma_pkey(vma); fault = handle_mm_fault(vma, address, flags); major |= fault & VM_FAULT_MAJOR; @@ -1470,7 +1480,7 @@ good_area: up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, vma, fault); + mm_fault_error(regs, error_code, address, &pkey, fault); return; } @@ -1490,27 +1500,6 @@ good_area: } NOKPROBE_SYMBOL(__do_page_fault); -dotraplinkage void notrace -do_page_fault(struct pt_regs *regs, unsigned long error_code) -{ - unsigned long address = read_cr2(); /* Get the faulting address */ - enum ctx_state prev_state; - - /* - * We must have this function tagged with __kprobes, notrace and call - * read_cr2() before calling anything else. To avoid calling any kind - * of tracing machinery before we've observed the CR2 value. - * - * exception_{enter,exit}() contain all sorts of tracepoints. - */ - - prev_state = exception_enter(); - __do_page_fault(regs, error_code, address); - exception_exit(prev_state); -} -NOKPROBE_SYMBOL(do_page_fault); - -#ifdef CONFIG_TRACING static nokprobe_inline void trace_page_fault_entries(unsigned long address, struct pt_regs *regs, unsigned long error_code) @@ -1521,22 +1510,24 @@ trace_page_fault_entries(unsigned long address, struct pt_regs *regs, trace_page_fault_kernel(address, regs, error_code); } +/* + * We must have this function blacklisted from kprobes, tagged with notrace + * and call read_cr2() before calling anything else. To avoid calling any + * kind of tracing machinery before we've observed the CR2 value. + * + * exception_{enter,exit}() contains all sorts of tracepoints. + */ dotraplinkage void notrace -trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) +do_page_fault(struct pt_regs *regs, unsigned long error_code) { - /* - * The exception_enter and tracepoint processing could - * trigger another page faults (user space callchain - * reading) and destroy the original cr2 value, so read - * the faulting address now. - */ - unsigned long address = read_cr2(); + unsigned long address = read_cr2(); /* Get the faulting address */ enum ctx_state prev_state; prev_state = exception_enter(); - trace_page_fault_entries(address, regs, error_code); + if (trace_pagefault_enabled()) + trace_page_fault_entries(address, regs, error_code); + __do_page_fault(regs, error_code, address); exception_exit(prev_state); } -NOKPROBE_SYMBOL(trace_do_page_fault); -#endif /* CONFIG_TRACING */ +NOKPROBE_SYMBOL(do_page_fault); diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 2824607df108..8ae0000cbdb3 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * IA-32 Huge TLB Page Support for Kernel. * @@ -18,6 +19,7 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/elf.h> +#include <asm/mpx.h> #if 0 /* This is just for testing */ struct page * @@ -85,25 +87,38 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, info.flags = 0; info.length = len; info.low_limit = get_mmap_base(1); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ info.high_limit = in_compat_syscall() ? - tasksize_32bit() : tasksize_64bit(); + task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); + info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr0, unsigned long len, + unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; - unsigned long addr; info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -118,7 +133,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; + info.high_limit = TASK_SIZE_LOW; addr = vm_unmapped_area(&info); } @@ -135,6 +150,11 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (len & ~huge_page_mask(h)) return -EINVAL; + + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + if (len > TASK_SIZE) return -ENOMEM; diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index adab1595f4bd..ab33a32df2a8 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Helper routines for building identity mapping page tables. This is * included by both the compressed kernel and the regular kernel. @@ -51,7 +52,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, if (!pmd) return -ENOMEM; ident_pmd_init(info, pmd, addr, next); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag)); } return 0; @@ -79,7 +80,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, if (!pud) return -ENOMEM; ident_pud_init(info, pud, addr, next); - set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag)); } return 0; @@ -93,6 +94,10 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long next; int result; + /* Set the default pagetable flags if not supplied */ + if (!info->kernpg_flag) + info->kernpg_flag = _KERNPG_TABLE; + for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); p4d_t *p4d; @@ -116,14 +121,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, if (result) return result; if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); } else { /* * With p4d folded, pgd is equal to p4d. * The pgd entry has to point to the pud page table in this case. */ pud_t *pud = pud_offset(p4d, 0); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag)); } } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bf3f1065d6ad..af5c1ed21d43 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -19,6 +19,7 @@ #include <asm/microcode.h> #include <asm/kaslr.h> #include <asm/hypervisor.h> +#include <asm/cpufeature.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -193,6 +194,38 @@ static void __init probe_page_size_mask(void) } } +static void setup_pcid(void) +{ +#ifdef CONFIG_X86_64 + if (boot_cpu_has(X86_FEATURE_PCID)) { + if (boot_cpu_has(X86_FEATURE_PGE)) { + /* + * This can't be cr4_set_bits_and_update_boot() -- + * the trampoline code can't handle CR4.PCIDE and + * it wouldn't do any good anyway. Despite the name, + * cr4_set_bits_and_update_boot() doesn't actually + * cause the bits in question to remain set all the + * way through the secondary boot asm. + * + * Instead, we brute-force it and set CR4.PCIDE + * manually in start_secondary(). + */ + cr4_set_bits(X86_CR4_PCIDE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't + * work if PCID is on but PGE is not. Since that + * combination doesn't exist on real hardware, there's + * no reason to try to fully support it, but it's + * polite to avoid corrupting data if we're on + * an improperly configured VM. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); + } + } +#endif +} + #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 #else /* CONFIG_X86_64 */ @@ -592,6 +625,7 @@ void __init init_mem_mapping(void) unsigned long end; probe_page_size_mask(); + setup_pcid(); #ifdef CONFIG_X86_64 end = max_pfn << PAGE_SHIFT; @@ -815,7 +849,7 @@ void __init zone_sizes_init(void) DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, - .state = 0, + .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; EXPORT_SYMBOL_GPL(cpu_tlbstate); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 136422d7d539..048fbe8fc274 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -761,7 +761,7 @@ void __init paging_init(void) * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need * updating. */ -static void update_end_of_memory_vars(u64 start, u64 size) +static void update_end_of_memory_vars(u64 start, u64 size) { unsigned long end_pfn = PFN_UP(start + size); @@ -772,22 +772,30 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int add_pages(int nid, unsigned long start_pfn, + unsigned long nr_pages, bool want_memblock) { - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - init_memory_mapping(start, start + size); - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ - update_end_of_memory_vars(start, size); + update_end_of_memory_vars(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT); return ret; } + +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + init_memory_mapping(start, start + size); + + return add_pages(nid, start_pfn, nr_pages, want_memblock); +} EXPORT_SYMBOL_GPL(arch_add_memory); #define PAGE_INUSE 0xFD diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 4c1b5fd0c7ad..34f0e1847dd6 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -13,6 +13,8 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mmiotrace.h> +#include <linux/mem_encrypt.h> +#include <linux/efi.h> #include <asm/set_memory.h> #include <asm/e820/api.h> @@ -21,6 +23,7 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/pat.h> +#include <asm/setup.h> #include "physaddr.h" @@ -106,12 +109,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, } /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (is_ISA_range(phys_addr, last_addr)) - return (__force void __iomem *)phys_to_virt(phys_addr); - - /* * Don't allow anybody to remap normal RAM that we're using.. */ pfn = phys_addr >> PAGE_SHIFT; @@ -340,13 +337,17 @@ void iounmap(volatile void __iomem *addr) return; /* - * __ioremap special-cases the PCI/ISA range by not instantiating a - * vm_area and by simply returning an address into the kernel mapping - * of ISA space. So handle that here. + * The PCI/ISA range special-casing was removed from __ioremap() + * so this check, in theory, can be removed. However, there are + * cases where iounmap() is called for addresses not obtained via + * ioremap() (vga16fb for example). Add a warning so that these + * cases can be caught and fixed. */ if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && - (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) + (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) { + WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n"); return; + } addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); @@ -399,12 +400,10 @@ void *xlate_dev_mem_ptr(phys_addr_t phys) unsigned long offset = phys & ~PAGE_MASK; void *vaddr; - /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ - if (page_is_ram(start >> PAGE_SHIFT)) - return __va(phys); + /* memremap() maps if RAM, otherwise falls back to ioremap() */ + vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB); - vaddr = ioremap_cache(start, PAGE_SIZE); - /* Only add the offset on success and return NULL if the ioremap() failed: */ + /* Only add the offset on success and return NULL if memremap() failed */ if (vaddr) vaddr += offset; @@ -413,11 +412,263 @@ void *xlate_dev_mem_ptr(phys_addr_t phys) void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) { - if (page_is_ram(phys >> PAGE_SHIFT)) - return; + memunmap((void *)((unsigned long)addr & PAGE_MASK)); +} + +/* + * Examine the physical address to determine if it is an area of memory + * that should be mapped decrypted. If the memory is not part of the + * kernel usable area it was accessed and created decrypted, so these + * areas should be mapped decrypted. And since the encryption key can + * change across reboots, persistent memory should also be mapped + * decrypted. + */ +static bool memremap_should_map_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + int is_pmem; + + /* + * Check if the address is part of a persistent memory region. + * This check covers areas added by E820, EFI and ACPI. + */ + is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM, + IORES_DESC_PERSISTENT_MEMORY); + if (is_pmem != REGION_DISJOINT) + return true; + + /* + * Check if the non-volatile attribute is set for an EFI + * reserved area. + */ + if (efi_enabled(EFI_BOOT)) { + switch (efi_mem_type(phys_addr)) { + case EFI_RESERVED_TYPE: + if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV) + return true; + break; + default: + break; + } + } + + /* Check if the address is outside kernel usable area */ + switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) { + case E820_TYPE_RESERVED: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: + case E820_TYPE_PRAM: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is EFI data. Check + * it against the boot params structure and EFI tables and memory types. + */ +static bool memremap_is_efi_data(resource_size_t phys_addr, + unsigned long size) +{ + u64 paddr; + + /* Check if the address is part of EFI boot/runtime data */ + if (!efi_enabled(EFI_BOOT)) + return false; + + paddr = boot_params.efi_info.efi_memmap_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_memmap; + if (phys_addr == paddr) + return true; + + paddr = boot_params.efi_info.efi_systab_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_systab; + if (phys_addr == paddr) + return true; + + if (efi_is_table_address(phys_addr)) + return true; + + switch (efi_mem_type(phys_addr)) { + case EFI_BOOT_SERVICES_DATA: + case EFI_RUNTIME_SERVICES_DATA: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain. + */ +static bool memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = memremap(paddr, sizeof(*data), + MEMREMAP_WB | MEMREMAP_DEC); + + paddr_next = data->next; + len = data->len; + + memunmap(data); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain (early boot version). + */ +static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = early_memremap_decrypted(paddr, sizeof(*data)); + + paddr_next = data->next; + len = data->len; + + early_memunmap(data, sizeof(*data)); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Architecture function to determine if RAM remap is allowed. By default, a + * RAM remap will map the data as encrypted. Determine if a RAM remap should + * not be done so that the data will be mapped decrypted. + */ +bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, + unsigned long flags) +{ + if (!sme_active()) + return true; + + if (flags & MEMREMAP_ENC) + return true; + + if (flags & MEMREMAP_DEC) + return false; + + if (memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size) || + memremap_should_map_decrypted(phys_addr, size)) + return false; + + return true; +} + +/* + * Architecture override of __weak function to adjust the protection attributes + * used when remapping memory. By default, early_memremap() will map the data + * as encrypted. Determine if an encrypted mapping should not be done and set + * the appropriate protection attributes. + */ +pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, + pgprot_t prot) +{ + if (!sme_active()) + return prot; + + if (early_memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size) || + memremap_should_map_decrypted(phys_addr, size)) + prot = pgprot_decrypted(prot); + else + prot = pgprot_encrypted(prot); + + return prot; +} + +bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) +{ + return arch_memremap_can_ram_remap(phys_addr, size, 0); +} + +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +/* Remap memory with encryption */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC); +} + +/* + * Remap memory with encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); +} + +/* Remap memory without encryption */ +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC); +} + +/* + * Remap memory without encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; - iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP); } +#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 02c9d7553409..8f5be3eb40dd 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define DISABLE_BRANCH_PROFILING #define pr_fmt(fmt) "kasan: " fmt #include <linux/bootmem.h> @@ -11,8 +12,8 @@ #include <asm/e820/types.h> #include <asm/tlbflush.h> #include <asm/sections.h> +#include <asm/pgtable.h> -extern pgd_t early_top_pgt[PTRS_PER_PGD]; extern struct range pfn_mapped[E820_MAX_ENTRIES]; static int __init map_range(struct range *range) @@ -87,7 +88,7 @@ static struct notifier_block kasan_die_notifier = { void __init kasan_early_init(void) { int i; - pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC; pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; @@ -153,7 +154,7 @@ void __init kasan_init(void) */ memset(kasan_zero_page, 0, PAGE_SIZE); for (i = 0; i < PTRS_PER_PTE; i++) { - pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC); set_pte(&kasan_zero_pte[i], pte); } /* Flush TLBs again to be sure that write protection applied. */ diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index af599167fe3c..879ef930e2c2 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This file implements KASLR memory randomization for x86_64. It randomizes * the virtual address space of kernel memory regions (physical memory diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index dab41876cdd5..872ec4159a68 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/interrupt.h> #include <linux/kdebug.h> #include <linux/kmemcheck.h> diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h index 0efc2e8d0a20..39f80d7a874d 100644 --- a/arch/x86/mm/kmemcheck/error.h +++ b/arch/x86/mm/kmemcheck/error.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H #define ARCH__X86__MM__KMEMCHECK__ERROR_H diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c index 324aa3f07237..df8109ddf7fe 100644 --- a/arch/x86/mm/kmemcheck/opcode.c +++ b/arch/x86/mm/kmemcheck/opcode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include "opcode.h" diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h index 6956aad66b5b..51a1ce94c24a 100644 --- a/arch/x86/mm/kmemcheck/opcode.h +++ b/arch/x86/mm/kmemcheck/opcode.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H #define ARCH__X86__MM__KMEMCHECK__OPCODE_H diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c index 4ead26eeaf96..8a03be90272a 100644 --- a/arch/x86/mm/kmemcheck/pte.c +++ b/arch/x86/mm/kmemcheck/pte.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <asm/pgtable.h> diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h index 9f5966456492..b595612382c2 100644 --- a/arch/x86/mm/kmemcheck/pte.h +++ b/arch/x86/mm/kmemcheck/pte.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH__X86__MM__KMEMCHECK__PTE_H #define ARCH__X86__MM__KMEMCHECK__PTE_H diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c index aef7140c0063..7ce0be1f99eb 100644 --- a/arch/x86/mm/kmemcheck/selftest.c +++ b/arch/x86/mm/kmemcheck/selftest.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/bug.h> #include <linux/kernel.h> diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h index 8fed4fe11f95..8d759aae453d 100644 --- a/arch/x86/mm/kmemcheck/selftest.h +++ b/arch/x86/mm/kmemcheck/selftest.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H #define ARCH_X86_MM_KMEMCHECK_SELFTEST_H diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h index ff0b2f70fbcb..49768dc18664 100644 --- a/arch/x86/mm/kmemcheck/shadow.h +++ b/arch/x86/mm/kmemcheck/shadow.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H #define ARCH__X86__MM__KMEMCHECK__SHADOW_H diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index afc47f5c9531..c21c2ed04612 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Support for MMIO probes. * Benfit many code from kprobes * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>. diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c new file mode 100644 index 000000000000..16c5f37933a2 --- /dev/null +++ b/arch/x86/mm/mem_encrypt.c @@ -0,0 +1,595 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define DISABLE_BRANCH_PROFILING + +#include <linux/linkage.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/dma-mapping.h> +#include <linux/swiotlb.h> +#include <linux/mem_encrypt.h> + +#include <asm/tlbflush.h> +#include <asm/fixmap.h> +#include <asm/setup.h> +#include <asm/bootparam.h> +#include <asm/set_memory.h> +#include <asm/cacheflush.h> +#include <asm/sections.h> +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <asm/cmdline.h> + +static char sme_cmdline_arg[] __initdata = "mem_encrypt"; +static char sme_cmdline_on[] __initdata = "on"; +static char sme_cmdline_off[] __initdata = "off"; + +/* + * Since SME related variables are set early in the boot process they must + * reside in the .data section so as not to be zeroed out when the .bss + * section is later cleared. + */ +u64 sme_me_mask __section(.data) = 0; +EXPORT_SYMBOL_GPL(sme_me_mask); + +/* Buffer used for early in-place encryption by BSP, no locking needed */ +static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); + +/* + * This routine does not change the underlying encryption setting of the + * page(s) that map this memory. It assumes that eventually the memory is + * meant to be accessed as either encrypted or decrypted but the contents + * are currently not in the desired state. + * + * This routine follows the steps outlined in the AMD64 Architecture + * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. + */ +static void __init __sme_early_enc_dec(resource_size_t paddr, + unsigned long size, bool enc) +{ + void *src, *dst; + size_t len; + + if (!sme_me_mask) + return; + + local_flush_tlb(); + wbinvd(); + + /* + * There are limited number of early mapping slots, so map (at most) + * one page at time. + */ + while (size) { + len = min_t(size_t, sizeof(sme_early_buffer), size); + + /* + * Create mappings for the current and desired format of + * the memory. Use a write-protected mapping for the source. + */ + src = enc ? early_memremap_decrypted_wp(paddr, len) : + early_memremap_encrypted_wp(paddr, len); + + dst = enc ? early_memremap_encrypted(paddr, len) : + early_memremap_decrypted(paddr, len); + + /* + * If a mapping can't be obtained to perform the operation, + * then eventual access of that area in the desired mode + * will cause a crash. + */ + BUG_ON(!src || !dst); + + /* + * Use a temporary buffer, of cache-line multiple size, to + * avoid data corruption as documented in the APM. + */ + memcpy(sme_early_buffer, src, len); + memcpy(dst, sme_early_buffer, len); + + early_memunmap(dst, len); + early_memunmap(src, len); + + paddr += len; + size -= len; + } +} + +void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, true); +} + +void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, false); +} + +static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, + bool map) +{ + unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET; + pmdval_t pmd_flags, pmd; + + /* Use early_pmd_flags but remove the encryption mask */ + pmd_flags = __sme_clr(early_pmd_flags); + + do { + pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0; + __early_make_pgtable((unsigned long)vaddr, pmd); + + vaddr += PMD_SIZE; + paddr += PMD_SIZE; + size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; + } while (size); + + __native_flush_tlb(); +} + +void __init sme_unmap_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!sme_active()) + return; + + /* Get the command line address before unmapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false); +} + +void __init sme_map_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!sme_active()) + return; + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); + + /* Get the command line address after mapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); +} + +void __init sme_early_init(void) +{ + unsigned int i; + + if (!sme_me_mask) + return; + + early_pmd_flags = __sme_set(early_pmd_flags); + + __supported_pte_mask = __sme_set(__supported_pte_mask); + + /* Update the protection map with memory encryption mask */ + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); +} + +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void) +{ + if (!sme_me_mask) + return; + + /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ + swiotlb_update_mem_attributes(); + + pr_info("AMD Secure Memory Encryption (SME) active\n"); +} + +void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) +{ + WARN(PAGE_ALIGN(size) != size, + "size is not page-aligned (%#lx)\n", size); + + /* Make the SWIOTLB buffer area decrypted */ + set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); +} + +static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start, + unsigned long end) +{ + unsigned long pgd_start, pgd_end, pgd_size; + pgd_t *pgd_p; + + pgd_start = start & PGDIR_MASK; + pgd_end = end & PGDIR_MASK; + + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1); + pgd_size *= sizeof(pgd_t); + + pgd_p = pgd_base + pgd_index(start); + + memset(pgd_p, 0, pgd_size); +} + +#define PGD_FLAGS _KERNPG_TABLE_NOENC +#define P4D_FLAGS _KERNPG_TABLE_NOENC +#define PUD_FLAGS _KERNPG_TABLE_NOENC +#define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) + +static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, + unsigned long vaddr, pmdval_t pmd_val) +{ + pgd_t *pgd_p; + p4d_t *p4d_p; + pud_t *pud_p; + pmd_t *pmd_p; + + pgd_p = pgd_base + pgd_index(vaddr); + if (native_pgd_val(*pgd_p)) { + if (IS_ENABLED(CONFIG_X86_5LEVEL)) + p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); + else + pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); + } else { + pgd_t pgd; + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_p = pgtable_area; + memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); + pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; + + pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); + } else { + pud_p = pgtable_area; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + + pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); + } + native_set_pgd(pgd_p, pgd); + } + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_p += p4d_index(vaddr); + if (native_p4d_val(*p4d_p)) { + pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); + } else { + p4d_t p4d; + + pud_p = pgtable_area; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + + p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); + native_set_p4d(p4d_p, p4d); + } + } + + pud_p += pud_index(vaddr); + if (native_pud_val(*pud_p)) { + if (native_pud_val(*pud_p) & _PAGE_PSE) + goto out; + + pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); + } else { + pud_t pud; + + pmd_p = pgtable_area; + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); + pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; + + pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); + native_set_pud(pud_p, pud); + } + + pmd_p += pmd_index(vaddr); + if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) + native_set_pmd(pmd_p, native_make_pmd(pmd_val)); + +out: + return pgtable_area; +} + +static unsigned long __init sme_pgtable_calc(unsigned long len) +{ + unsigned long p4d_size, pud_size, pmd_size; + unsigned long total; + + /* + * Perform a relatively simplistic calculation of the pagetable + * entries that are needed. That mappings will be covered by 2MB + * PMD entries so we can conservatively calculate the required + * number of P4D, PUD and PMD structures needed to perform the + * mappings. Incrementing the count for each covers the case where + * the addresses cross entries. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; + pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } else { + p4d_size = 0; + pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } + pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + + total = p4d_size + pud_size + pmd_size; + + /* + * Now calculate the added pagetable structures needed to populate + * the new pagetables. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; + pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } else { + p4d_size = 0; + pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } + pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE; + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + + total += p4d_size + pud_size + pmd_size; + + return total; +} + +void __init sme_encrypt_kernel(void) +{ + unsigned long workarea_start, workarea_end, workarea_len; + unsigned long execute_start, execute_end, execute_len; + unsigned long kernel_start, kernel_end, kernel_len; + unsigned long pgtable_area_len; + unsigned long paddr, pmd_flags; + unsigned long decrypted_base; + void *pgtable_area; + pgd_t *pgd; + + if (!sme_active()) + return; + + /* + * Prepare for encrypting the kernel by building new pagetables with + * the necessary attributes needed to encrypt the kernel in place. + * + * One range of virtual addresses will map the memory occupied + * by the kernel as encrypted. + * + * Another range of virtual addresses will map the memory occupied + * by the kernel as decrypted and write-protected. + * + * The use of write-protect attribute will prevent any of the + * memory from being cached. + */ + + /* Physical addresses gives us the identity mapped virtual addresses */ + kernel_start = __pa_symbol(_text); + kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); + kernel_len = kernel_end - kernel_start; + + /* Set the encryption workarea to be immediately after the kernel */ + workarea_start = kernel_end; + + /* + * Calculate required number of workarea bytes needed: + * executable encryption area size: + * stack page (PAGE_SIZE) + * encryption routine page (PAGE_SIZE) + * intermediate copy buffer (PMD_PAGE_SIZE) + * pagetable structures for the encryption of the kernel + * pagetable structures for workarea (in case not currently mapped) + */ + execute_start = workarea_start; + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; + execute_len = execute_end - execute_start; + + /* + * One PGD for both encrypted and decrypted mappings and a set of + * PUDs and PMDs for each of the encrypted and decrypted mappings. + */ + pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; + pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; + + /* PUDs and PMDs needed in the current pagetables for the workarea */ + pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); + + /* + * The total workarea includes the executable encryption area and + * the pagetable area. + */ + workarea_len = execute_len + pgtable_area_len; + workarea_end = workarea_start + workarea_len; + + /* + * Set the address to the start of where newly created pagetable + * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable + * structures are created when the workarea is added to the current + * pagetables and when the new encrypted and decrypted kernel + * mappings are populated. + */ + pgtable_area = (void *)execute_end; + + /* + * Make sure the current pagetable structure has entries for + * addressing the workarea. + */ + pgd = (pgd_t *)native_read_cr3_pa(); + paddr = workarea_start; + while (paddr < workarea_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + PMD_FLAGS); + + paddr += PMD_PAGE_SIZE; + } + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); + + /* + * A new pagetable structure is being built to allow for the kernel + * to be encrypted. It starts with an empty PGD that will then be + * populated with new PUDs and PMDs as the encrypted and decrypted + * kernel mappings are created. + */ + pgd = pgtable_area; + memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD); + pgtable_area += sizeof(*pgd) * PTRS_PER_PGD; + + /* Add encrypted kernel (identity) mappings */ + pmd_flags = PMD_FLAGS | _PAGE_ENC; + paddr = kernel_start; + while (paddr < kernel_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + pmd_flags); + + paddr += PMD_PAGE_SIZE; + } + + /* + * A different PGD index/entry must be used to get different + * pagetable entries for the decrypted mapping. Choose the next + * PGD index and convert it to a virtual address to be used as + * the base of the mapping. + */ + decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); + decrypted_base <<= PGDIR_SHIFT; + + /* Add decrypted, write-protected kernel (non-identity) mappings */ + pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT); + paddr = kernel_start; + while (paddr < kernel_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr + decrypted_base, + paddr + pmd_flags); + + paddr += PMD_PAGE_SIZE; + } + + /* Add decrypted workarea mappings to both kernel mappings */ + paddr = workarea_start; + while (paddr < workarea_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + PMD_FLAGS); + + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr + decrypted_base, + paddr + PMD_FLAGS); + + paddr += PMD_PAGE_SIZE; + } + + /* Perform the encryption */ + sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, + kernel_len, workarea_start, (unsigned long)pgd); + + /* + * At this point we are running encrypted. Remove the mappings for + * the decrypted areas - all that is needed for this is to remove + * the PGD entry/entries. + */ + sme_clear_pgd(pgd, kernel_start + decrypted_base, + kernel_end + decrypted_base); + + sme_clear_pgd(pgd, workarea_start + decrypted_base, + workarea_end + decrypted_base); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); +} + +void __init __nostackprotector sme_enable(struct boot_params *bp) +{ + const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; + unsigned int eax, ebx, ecx, edx; + bool active_by_default; + unsigned long me_mask; + char buffer[16]; + u64 msr; + + /* Check for the SME support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return; + + /* + * Check for the SME feature: + * CPUID Fn8000_001F[EAX] - Bit 0 + * Secure Memory Encryption support + * CPUID Fn8000_001F[EBX] - Bits 5:0 + * Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (!(eax & 1)) + return; + + me_mask = 1UL << (ebx & 0x3f); + + /* Check if SME is enabled */ + msr = __rdmsr(MSR_K8_SYSCFG); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + + /* + * Fixups have not been applied to phys_base yet and we're running + * identity mapped, so we must obtain the address to the SME command + * line argument data using rip-relative addressing. + */ + asm ("lea sme_cmdline_arg(%%rip), %0" + : "=r" (cmdline_arg) + : "p" (sme_cmdline_arg)); + asm ("lea sme_cmdline_on(%%rip), %0" + : "=r" (cmdline_on) + : "p" (sme_cmdline_on)); + asm ("lea sme_cmdline_off(%%rip), %0" + : "=r" (cmdline_off) + : "p" (sme_cmdline_off)); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) + active_by_default = true; + else + active_by_default = false; + + cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | + ((u64)bp->ext_cmd_line_ptr << 32)); + + cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); + + if (!strncmp(buffer, cmdline_on, sizeof(buffer))) + sme_me_mask = me_mask; + else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) + sme_me_mask = 0; + else + sme_me_mask = active_by_default ? me_mask : 0; +} diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S new file mode 100644 index 000000000000..730e6d541df1 --- /dev/null +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -0,0 +1,149 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/pgtable.h> +#include <asm/page.h> +#include <asm/processor-flags.h> +#include <asm/msr-index.h> + + .text + .code64 +ENTRY(sme_encrypt_execute) + + /* + * Entry parameters: + * RDI - virtual address for the encrypted kernel mapping + * RSI - virtual address for the decrypted kernel mapping + * RDX - length of kernel + * RCX - virtual address of the encryption workarea, including: + * - stack page (PAGE_SIZE) + * - encryption routine page (PAGE_SIZE) + * - intermediate copy buffer (PMD_PAGE_SIZE) + * R8 - physcial address of the pagetables to use for encryption + */ + + push %rbp + movq %rsp, %rbp /* RBP now has original stack pointer */ + + /* Set up a one page stack in the non-encrypted memory area */ + movq %rcx, %rax /* Workarea stack page */ + leaq PAGE_SIZE(%rax), %rsp /* Set new stack pointer */ + addq $PAGE_SIZE, %rax /* Workarea encryption routine */ + + push %r12 + movq %rdi, %r10 /* Encrypted kernel */ + movq %rsi, %r11 /* Decrypted kernel */ + movq %rdx, %r12 /* Kernel length */ + + /* Copy encryption routine into the workarea */ + movq %rax, %rdi /* Workarea encryption routine */ + leaq __enc_copy(%rip), %rsi /* Encryption routine */ + movq $(.L__enc_copy_end - __enc_copy), %rcx /* Encryption routine length */ + rep movsb + + /* Setup registers for call */ + movq %r10, %rdi /* Encrypted kernel */ + movq %r11, %rsi /* Decrypted kernel */ + movq %r8, %rdx /* Pagetables used for encryption */ + movq %r12, %rcx /* Kernel length */ + movq %rax, %r8 /* Workarea encryption routine */ + addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */ + + call *%rax /* Call the encryption routine */ + + pop %r12 + + movq %rbp, %rsp /* Restore original stack pointer */ + pop %rbp + + ret +ENDPROC(sme_encrypt_execute) + +ENTRY(__enc_copy) +/* + * Routine used to encrypt kernel. + * This routine must be run outside of the kernel proper since + * the kernel will be encrypted during the process. So this + * routine is defined here and then copied to an area outside + * of the kernel where it will remain and run decrypted + * during execution. + * + * On entry the registers must be: + * RDI - virtual address for the encrypted kernel mapping + * RSI - virtual address for the decrypted kernel mapping + * RDX - address of the pagetables to use for encryption + * RCX - length of kernel + * R8 - intermediate copy buffer + * + * RAX - points to this routine + * + * The kernel will be encrypted by copying from the non-encrypted + * kernel space to an intermediate buffer and then copying from the + * intermediate buffer back to the encrypted kernel space. The physical + * addresses of the two kernel space mappings are the same which + * results in the kernel being encrypted "in place". + */ + /* Enable the new page tables */ + mov %rdx, %cr3 + + /* Flush any global TLBs */ + mov %cr4, %rdx + andq $~X86_CR4_PGE, %rdx + mov %rdx, %cr4 + orq $X86_CR4_PGE, %rdx + mov %rdx, %cr4 + + /* Set the PAT register PA5 entry to write-protect */ + push %rcx + movl $MSR_IA32_CR_PAT, %ecx + rdmsr + push %rdx /* Save original PAT value */ + andl $0xffff00ff, %edx /* Clear PA5 */ + orl $0x00000500, %edx /* Set PA5 to WP */ + wrmsr + pop %rdx /* RDX contains original PAT value */ + pop %rcx + + movq %rcx, %r9 /* Save kernel length */ + movq %rdi, %r10 /* Save encrypted kernel address */ + movq %rsi, %r11 /* Save decrypted kernel address */ + + wbinvd /* Invalidate any cache entries */ + + /* Copy/encrypt 2MB at a time */ +1: + movq %r11, %rsi /* Source - decrypted kernel */ + movq %r8, %rdi /* Dest - intermediate copy buffer */ + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + rep movsb + + movq %r8, %rsi /* Source - intermediate copy buffer */ + movq %r10, %rdi /* Dest - encrypted kernel */ + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + rep movsb + + addq $PMD_PAGE_SIZE, %r11 + addq $PMD_PAGE_SIZE, %r10 + subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */ + jnz 1b /* Kernel length not zero? */ + + /* Restore PAT register */ + push %rdx /* Save original PAT value */ + movl $MSR_IA32_CR_PAT, %ecx + rdmsr + pop %rdx /* Restore original PAT value */ + wrmsr + + ret +.L__enc_copy_end: +ENDPROC(__enc_copy) diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 62474ba66c8e..4e1f6e1b8159 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_MM_INTERNAL_H #define __X86_MM_INTERNAL_H diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index a88cfbfbd078..a99679826846 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -37,21 +37,21 @@ struct va_alignment __read_mostly va_align = { .flags = -1, }; -unsigned long tasksize_32bit(void) +unsigned long task_size_32bit(void) { return IA32_PAGE_OFFSET; } -unsigned long tasksize_64bit(void) +unsigned long task_size_64bit(int full_addr_space) { - return TASK_SIZE_MAX; + return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW; } static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; if (current->flags & PF_RANDOMIZE) { - max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); + max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit()); max <<= PAGE_SHIFT; } @@ -141,7 +141,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, - arch_rnd(mmap64_rnd_bits), tasksize_64bit()); + arch_rnd(mmap64_rnd_bits), task_size_64bit(0)); #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES /* @@ -151,7 +151,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * mmap_base, the compat syscall uses mmap_compat_base. */ arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, - arch_rnd(mmap32_rnd_bits), tasksize_32bit()); + arch_rnd(mmap32_rnd_bits), task_size_32bit()); #endif } diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 1c34b767c84c..7eb06701a935 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * mpx.c - Memory Protection eXtensions * @@ -355,10 +356,19 @@ int mpx_enable_management(void) */ bd_base = mpx_get_bounds_dir(); down_write(&mm->mmap_sem); + + /* MPX doesn't support addresses above 47 bits yet. */ + if (find_vma(mm, DEFAULT_MAP_WINDOW)) { + pr_warn_once("%s (%d): MPX cannot handle addresses " + "above 47-bits. Disabling.", + current->comm, current->pid); + ret = -ENXIO; + goto out; + } mm->context.bd_addr = bd_base; if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) ret = -ENXIO; - +out: up_write(&mm->mmap_sem); return ret; } @@ -1030,3 +1040,25 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, if (ret) force_sig(SIGSEGV, current); } + +/* MPX cannot handle addresses above 47 bits yet. */ +unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, + unsigned long flags) +{ + if (!kernel_managing_mpx_tables(current->mm)) + return addr; + if (addr + len <= DEFAULT_MAP_WINDOW) + return addr; + if (flags & MAP_FIXED) + return -ENOMEM; + + /* + * Requested len is larger than the whole area we're allowed to map in. + * Resetting hinting address wouldn't do much good -- fail early. + */ + if (len > DEFAULT_MAP_WINDOW) + return -ENOMEM; + + /* Look for unmap area within DEFAULT_MAP_WINDOW */ + return 0; +} diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 9405ffc91502..066f3511d5f1 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Generic VM initialization for x86-64 NUMA setups. * Copyright 2002,2003 Andi Kleen, SuSE Labs. diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index a8f90ce3dedf..34a2a3bfde9c 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NUMA emulation */ @@ -75,13 +76,15 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, /* * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr - * to max_addr. The return value is the number of nodes allocated. + * to max_addr. + * + * Returns zero on success or negative on error. */ static int __init split_nodes_interleave(struct numa_meminfo *ei, struct numa_meminfo *pi, u64 addr, u64 max_addr, int nr_nodes) { - nodemask_t physnode_mask = NODE_MASK_NONE; + nodemask_t physnode_mask = numa_nodes_parsed; u64 size; int big; int nid = 0; @@ -116,9 +119,6 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, return -1; } - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - /* * Continue to fill physical nodes with fake nodes until there is no * memory left on any of them. @@ -200,13 +200,15 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) /* * Sets up fake nodes of `size' interleaved over physical nodes ranging from - * `addr' to `max_addr'. The return value is the number of nodes allocated. + * `addr' to `max_addr'. + * + * Returns zero on success or negative on error. */ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, struct numa_meminfo *pi, u64 addr, u64 max_addr, u64 size) { - nodemask_t physnode_mask = NODE_MASK_NONE; + nodemask_t physnode_mask = numa_nodes_parsed; u64 min_size; int nid = 0; int i, ret; @@ -231,9 +233,6 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, } size &= FAKE_NODE_MIN_HASH_MASK; - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - /* * Fill physical nodes with fake nodes of size until there is no memory * left on any of them. @@ -280,6 +279,22 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, return 0; } +int __init setup_emu2phys_nid(int *dfl_phys_nid) +{ + int i, max_emu_nid = 0; + + *dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + max_emu_nid = i; + if (*dfl_phys_nid == NUMA_NO_NODE) + *dfl_phys_nid = emu_nid_to_phys[i]; + } + } + + return max_emu_nid; +} + /** * numa_emulation - Emulate NUMA nodes * @numa_meminfo: NUMA configuration to massage @@ -376,23 +391,18 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) * Determine the max emulated nid and the default phys nid to use * for unmapped nodes. */ - max_emu_nid = 0; - dfl_phys_nid = NUMA_NO_NODE; - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { - if (emu_nid_to_phys[i] != NUMA_NO_NODE) { - max_emu_nid = i; - if (dfl_phys_nid == NUMA_NO_NODE) - dfl_phys_nid = emu_nid_to_phys[i]; - } - } - if (dfl_phys_nid == NUMA_NO_NODE) { - pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); - goto no_emu; - } + max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); /* commit */ *numa_meminfo = ei; + /* Make sure numa_nodes_parsed only contains emulated nodes */ + nodes_clear(numa_nodes_parsed); + for (i = 0; i < ARRAY_SIZE(ei.blk); i++) + if (ei.blk[i].start != ei.blk[i].end && + ei.blk[i].nid != NUMA_NO_NODE) + node_set(ei.blk[i].nid, numa_nodes_parsed); + /* * Transform __apicid_to_node table to use emulated nids by * reverse-mapping phys_nid. The maps should always exist but fall diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index ad86ec91e640..86860f279662 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_MM_NUMA_INTERNAL_H #define __X86_MM_NUMA_INTERNAL_H diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 5f169d5d76a8..a25588ad75ef 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * self test for change_page_attr. * diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 757b0bcdf712..dfb7d657cf43 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1775,6 +1775,70 @@ int set_memory_4k(unsigned long addr, int numpages) __pgprot(0), 1, 0, NULL); } +static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) +{ + struct cpa_data cpa; + unsigned long start; + int ret; + + /* Nothing to do if the SME is not active */ + if (!sme_active()) + return 0; + + /* Should not be working on unaligned addresses */ + if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) + addr &= PAGE_MASK; + + start = addr; + + memset(&cpa, 0, sizeof(cpa)); + cpa.vaddr = &addr; + cpa.numpages = numpages; + cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0); + cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC); + cpa.pgd = init_mm.pgd; + + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + vm_unmap_aliases(); + + /* + * Before changing the encryption attribute, we need to flush caches. + */ + if (static_cpu_has(X86_FEATURE_CLFLUSH)) + cpa_flush_range(start, numpages, 1); + else + cpa_flush_all(1); + + ret = __change_page_attr_set_clr(&cpa, 1); + + /* + * After changing the encryption attribute, we need to flush TLBs + * again in case any speculative TLB caching occurred (but no need + * to flush caches again). We could just use cpa_flush_all(), but + * in case TLB flushing gets optimized in the cpa_flush_range() + * path use the same logic as above. + */ + if (static_cpu_has(X86_FEATURE_CLFLUSH)) + cpa_flush_range(start, numpages, 0); + else + cpa_flush_all(0); + + return ret; +} + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + return __set_memory_enc_dec(addr, numpages, true); +} +EXPORT_SYMBOL_GPL(set_memory_encrypted); + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + return __set_memory_enc_dec(addr, numpages, false); +} +EXPORT_SYMBOL_GPL(set_memory_decrypted); + int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); @@ -2020,6 +2084,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, if (!(page_flags & _PAGE_RW)) cpa.mask_clr = __pgprot(_PAGE_RW); + if (!(page_flags & _PAGE_ENC)) + cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 0); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 45979502f64b..fe7d57a8fb60 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -293,7 +293,7 @@ void init_cache_modes(void) * pat_init - Initialize PAT MSR and PAT table * * This function initializes PAT MSR and PAT table with an OS-defined value - * to enable additional cache attributes, WC and WT. + * to enable additional cache attributes, WC, WT and WP. * * This function must be called on all CPUs using the specific sequence of * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this @@ -352,7 +352,7 @@ void pat_init(void) * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS * 011 3 UC : _PAGE_CACHE_MODE_UC * 100 4 WB : Reserved - * 101 5 WC : Reserved + * 101 5 WP : _PAGE_CACHE_MODE_WP * 110 6 UC-: Reserved * 111 7 WT : _PAGE_CACHE_MODE_WT * @@ -360,7 +360,7 @@ void pat_init(void) * corresponding types in the presence of PAT errata. */ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | - PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT); + PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT); } if (!boot_cpu_done) { @@ -744,6 +744,9 @@ EXPORT_SYMBOL(arch_io_free_memtype_wc); pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { + if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size)) + vma_prot = pgprot_decrypted(vma_prot); + return vma_prot; } diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h index a739bfc40690..eeb5caeb089b 100644 --- a/arch/x86/mm/pat_internal.h +++ b/arch/x86/mm/pat_internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __PAT_INTERNAL_H_ #define __PAT_INTERNAL_H_ diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index d76485b22824..fa16036fa592 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Handle caching attributes in page tables (PAT) * diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 508a708eb9a6..17ebc5a978cc 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/gfp.h> #include <asm/pgalloc.h> @@ -56,7 +57,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); - tlb_remove_page(tlb, pte); + tlb_remove_table(tlb, pte); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -72,21 +73,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) tlb->need_flush_all = 1; #endif pgtable_pmd_page_dtor(page); - tlb_remove_page(tlb, page); + tlb_remove_table(tlb, page); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(pud)); + tlb_remove_table(tlb, virt_to_page(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(p4d)); + tlb_remove_table(tlb, virt_to_page(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ @@ -426,10 +427,8 @@ int ptep_set_access_flags(struct vm_area_struct *vma, { int changed = !pte_same(*ptep, entry); - if (changed && dirty) { + if (changed && dirty) *ptep = entry; - pte_update(vma->vm_mm, address, ptep); - } return changed; } @@ -486,9 +485,6 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *) &ptep->pte); - if (ret) - pte_update(vma->vm_mm, addr, ptep); - return ret; } diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index b9bd5b8b14fa..6b9bf023a700 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/sched.h> #include <linux/kernel.h> #include <linux/errno.h> diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index cfc3b9121ce4..7f9acb68324c 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/bootmem.h> #include <linux/mmdebug.h> #include <linux/export.h> diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h index a3cd5a0c97b3..9f6419cafc32 100644 --- a/arch/x86/mm/physaddr.h +++ b/arch/x86/mm/physaddr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <asm/processor.h> static inline int phys_addr_valid(resource_size_t addr) diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 2dab69a706ec..d7bc0eea20a5 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -18,7 +18,6 @@ #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/mmu_context.h> /* vma_pkey() */ -#include <asm/fpu/internal.h> /* fpregs_active() */ int __execute_only_pkey(struct mm_struct *mm) { @@ -45,7 +44,7 @@ int __execute_only_pkey(struct mm_struct *mm) */ preempt_disable(); if (!need_to_set_mm_pkey && - fpregs_active() && + current->thread.fpu.initialized && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { preempt_enable(); return execute_only_pkey; diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index f65a33f505b6..adb3c5784dac 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/spinlock.h> #include <linux/errno.h> #include <linux/init.h> diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 3ea20d61b523..dac07e4f5834 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ACPI 3.0 based NUMA setup * Copyright 2004 Andi Kleen, SuSE Labs. diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 014d07a80053..3118392cdf75 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,43 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); + + +static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + u16 *new_asid, bool *need_flush) +{ + u16 asid; + + if (!static_cpu_has(X86_FEATURE_PCID)) { + *new_asid = 0; + *need_flush = true; + return; + } + + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != + next->context.ctx_id) + continue; + + *new_asid = asid; + *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < + next_tlb_gen); + return; + } + + /* + * We don't currently own an ASID slot on this CPU. + * Allocate a slot. + */ + *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; + if (*new_asid >= TLB_NR_DYN_ASIDS) { + *new_asid = 0; + this_cpu_write(cpu_tlbstate.next_asid, 1); + } + *need_flush = true; +} + void leave_mm(int cpu) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); @@ -43,8 +80,8 @@ void leave_mm(int cpu) if (loaded_mm == &init_mm) return; - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - BUG(); + /* Warn if we're not lazy. */ + WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy)); switch_mm(NULL, &init_mm, NULL); } @@ -63,115 +100,308 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { - unsigned cpu = smp_processor_id(); struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); + u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + unsigned cpu = smp_processor_id(); + u64 next_tlb_gen; /* - * NB: The scheduler will call us with prev == next when - * switching from lazy TLB mode to normal mode if active_mm - * isn't changing. When this happens, there is no guarantee - * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next. + * NB: The scheduler will call us with prev == next when switching + * from lazy TLB mode to normal mode if active_mm isn't changing. + * When this happens, we don't assume that CR3 (and hence + * cpu_tlbstate.loaded_mm) matches next. * * NB: leave_mm() calls us with prev == NULL and tsk == NULL. */ - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + /* We don't want flush_tlb_func_* to run concurrently with us. */ + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(!irqs_disabled()); + + /* + * Verify that CR3 is what we think it is. This will catch + * hypothetical buggy code that directly switches to swapper_pg_dir + * without going through leave_mm() / switch_mm_irqs_off() or that + * does something like write_cr3(read_cr3_pa()). + * + * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3() + * isn't free. + */ +#ifdef CONFIG_DEBUG_VM + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { + /* + * If we were to BUG here, we'd be very likely to kill + * the system so hard that we don't see the call trace. + * Try to recover instead by ignoring the error and doing + * a global flush to minimize the chance of corruption. + * + * (This is far from being a fully correct recovery. + * Architecturally, the CPU could prefetch something + * back into an incorrect ASID slot and leave it there + * to cause trouble down the road. It's better than + * nothing, though.) + */ + __flush_tlb_all(); + } +#endif + this_cpu_write(cpu_tlbstate.is_lazy, false); if (real_prev == next) { + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + next->context.ctx_id); + /* - * There's nothing to do: we always keep the per-mm control - * regs in sync with cpu_tlbstate.loaded_mm. Just - * sanity-check mm_cpumask. + * We don't currently support having a real mm loaded without + * our cpu set in mm_cpumask(). We have all the bookkeeping + * in place to figure out whether we would need to flush + * if our cpu were cleared in mm_cpumask(), but we don't + * currently use it. */ - if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next)))) + if (WARN_ON_ONCE(real_prev != &init_mm && + !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); + return; - } + } else { + u16 new_asid; + bool need_flush; + + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* + * If our current stack is in vmalloc space and isn't + * mapped in the new pgd, we'll double-fault. Forcibly + * map it. + */ + unsigned int index = pgd_index(current_stack_pointer); + pgd_t *pgd = next->pgd + index; + + if (unlikely(pgd_none(*pgd))) + set_pgd(pgd, init_mm.pgd[index]); + } + + /* Stop remote flushes for the previous mm */ + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && + real_prev != &init_mm); + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); - if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. + * Start remote flushes and then read tlb_gen. */ - unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + write_cr3(build_cr3(next, new_asid)); + + /* + * NB: This gets called via leave_mm() in the idle path + * where RCU functions differently. Tracing normally + * uses RCU, so we need to use the _rcuidle variant. + * + * (There is no good reason for this. The idle code should + * be rearranged to call this before rcu_idle_enter().) + */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + write_cr3(build_cr3_noflush(next, new_asid)); + + /* See above wrt _rcuidle. */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + } - pgd_t *pgd = next->pgd + stack_pgd_index; + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + } + + load_mm_cr4(next); + switch_ldt(real_prev, next); +} + +/* + * Please ignore the name of this function. It should be called + * switch_to_kernel_thread(). + * + * enter_lazy_tlb() is a hint from the scheduler that we are entering a + * kernel thread or other context without an mm. Acceptable implementations + * include doing nothing whatsoever, switching to init_mm, or various clever + * lazy tricks to try to minimize TLB flushes. + * + * The scheduler reserves the right to call enter_lazy_tlb() several times + * in a row. It will notify us that we're going back to a real mm by + * calling switch_mm_irqs_off(). + */ +void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ + if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) + return; - if (unlikely(pgd_none(*pgd))) - set_pgd(pgd, init_mm.pgd[stack_pgd_index]); + if (tlb_defer_switch_to_init_mm()) { + /* + * There's a significant optimization that may be possible + * here. We have accurate enough TLB flush tracking that we + * don't need to maintain coherence of TLB per se when we're + * lazy. We do, however, need to maintain coherence of + * paging-structure caches. We could, in principle, leave our + * old mm loaded and only switch to init_mm when + * tlb_remove_page() happens. + */ + this_cpu_write(cpu_tlbstate.is_lazy, true); + } else { + switch_mm(NULL, &init_mm, NULL); } +} - this_cpu_write(cpu_tlbstate.loaded_mm, next); +/* + * Call this when reinitializing a CPU. It fixes the following potential + * problems: + * + * - The ASID changed from what cpu_tlbstate thinks it is (most likely + * because the CPU was taken down and came back up with CR3's PCID + * bits clear. CPU hotplug can do this. + * + * - The TLB contains junk in slots corresponding to inactive ASIDs. + * + * - The CPU went so far out to lunch that it may have missed a TLB + * flush. + */ +void initialize_tlbstate_and_flush(void) +{ + int i; + struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); + unsigned long cr3 = __read_cr3(); - WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); - cpumask_set_cpu(cpu, mm_cpumask(next)); + /* Assert that CR3 already references the right mm. */ + WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); /* - * Re-load page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. + * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization + * doesn't work like other CR4 bits because it can only be set from + * long mode.) */ - load_cr3(next->pgd); + WARN_ON(boot_cpu_has(X86_FEATURE_PCID) && + !(cr4_read_shadow() & X86_CR4_PCIDE)); - /* - * This gets called via leave_mm() in the idle path where RCU - * functions differently. Tracing normally uses RCU, so we have to - * call the tracepoint specially here. - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + /* Force ASID 0 and force a TLB flush. */ + write_cr3(build_cr3(mm, 0)); - /* Stop flush ipis for the previous mm */ - WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && - real_prev != &init_mm); - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + /* Reinitialize tlbstate. */ + this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); + this_cpu_write(cpu_tlbstate.next_asid, 1); + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); - /* Load per-mm CR4 and LDTR state */ - load_mm_cr4(next); - switch_ldt(real_prev, next); + for (i = 1; i < TLB_NR_DYN_ASIDS; i++) + this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); } +/* + * flush_tlb_func_common()'s memory ordering requirement is that any + * TLB fills that happen after we flush the TLB are ordered after we + * read active_mm's tlb_gen. We don't need any explicit barriers + * because all x86 flush operations are serializing and the + * atomic64_read operation won't be reordered by the compiler. + */ static void flush_tlb_func_common(const struct flush_tlb_info *f, bool local, enum tlb_flush_reason reason) { + /* + * We have three different tlb_gen values in here. They are: + * + * - mm_tlb_gen: the latest generation. + * - local_tlb_gen: the generation that this CPU has already caught + * up to. + * - f->new_tlb_gen: the generation that the requester of the flush + * wants us to catch up to. + */ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + /* This code cannot presently handle being reentered. */ VM_WARN_ON(!irqs_disabled()); - if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { - leave_mm(smp_processor_id()); + if (unlikely(loaded_mm == &init_mm)) + return; + + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + + if (this_cpu_read(cpu_tlbstate.is_lazy)) { + /* + * We're in lazy mode. We need to at least flush our + * paging-structure cache to avoid speculatively reading + * garbage into our TLB. Since switching to init_mm is barely + * slower than a minimal flush, just switch to init_mm. + */ + switch_mm_irqs_off(NULL, &init_mm, NULL); return; } - if (f->end == TLB_FLUSH_ALL) { - local_flush_tlb(); - if (local) - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - trace_tlb_flush(reason, TLB_FLUSH_ALL); - } else { + if (unlikely(local_tlb_gen == mm_tlb_gen)) { + /* + * There's nothing to do: we're already up to date. This can + * happen if two concurrent flushes happen -- the first flush to + * be handled can catch us all the way up, leaving no work for + * the second flush. + */ + trace_tlb_flush(reason, 0); + return; + } + + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); + + /* + * If we get to this point, we know that our TLB is out of date. + * This does not strictly imply that we need to flush (it's + * possible that f->new_tlb_gen <= local_tlb_gen), but we're + * going to need to flush in the very near future, so we might + * as well get it over with. + * + * The only question is whether to do a full or partial flush. + * + * We do a partial flush if requested and two extra conditions + * are met: + * + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that + * we've always done all needed flushes to catch up to + * local_tlb_gen. If, for example, local_tlb_gen == 2 and + * f->new_tlb_gen == 3, then we know that the flush needed to bring + * us up to date for tlb_gen 3 is the partial flush we're + * processing. + * + * As an example of why this check is needed, suppose that there + * are two concurrent flushes. The first is a full flush that + * changes context.tlb_gen from 1 to 2. The second is a partial + * flush that changes context.tlb_gen from 2 to 3. If they get + * processed on this CPU in reverse order, we'll see + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. + * If we were to use __flush_tlb_single() and set local_tlb_gen to + * 3, we'd be break the invariant: we'd update local_tlb_gen above + * 1 without the full flush that's needed for tlb_gen 2. + * + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * Partial TLB flushes are not all that much cheaper than full TLB + * flushes, so it seems unlikely that it would be a performance win + * to do a partial flush if that won't bring our TLB fully up to + * date. By doing a full flush instead, we can increase + * local_tlb_gen all the way to mm_tlb_gen and we can probably + * avoid another flush in the very near future. + */ + if (f->end != TLB_FLUSH_ALL && + f->new_tlb_gen == local_tlb_gen + 1 && + f->new_tlb_gen == mm_tlb_gen) { + /* Partial flush */ unsigned long addr; unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; + addr = f->start; while (addr < f->end) { __flush_tlb_single(addr); @@ -180,7 +410,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, if (local) count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); trace_tlb_flush(reason, nr_pages); + } else { + /* Full flush. */ + local_flush_tlb(); + if (local) + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + trace_tlb_flush(reason, TLB_FLUSH_ALL); } + + /* Both paths above update our state to mm_tlb_gen. */ + this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); } static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) @@ -214,6 +453,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (info->end - info->start) >> PAGE_SHIFT); if (is_uv_system()) { + /* + * This whole special case is confused. UV has a "Broadcast + * Assist Unit", which seems to be a fancy way to send IPIs. + * Back when x86 used an explicit TLB flush IPI, UV was + * optimized to use its own mechanism. These days, x86 uses + * smp_call_function_many(), but UV still uses a manual IPI, + * and that IPI's action is out of date -- it does a manual + * flush instead of calling flush_tlb_func_remote(). This + * means that the percpu tlb_gen variables won't be updated + * and we'll do pointless flushes on future context switches. + * + * Rather than hooking native_flush_tlb_others() here, I think + * that UV should be updated so that smp_call_function_many(), + * etc, are optimal on UV. + */ unsigned int cpu; cpu = smp_processor_id(); @@ -250,8 +504,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, cpu = get_cpu(); - /* Synchronize with switch_mm. */ - smp_mb(); + /* This is also a barrier that synchronizes with switch_mm(). */ + info.new_tlb_gen = inc_mm_tlb_gen(mm); /* Should we flush just the requested range? */ if ((end != TLB_FLUSH_ALL) && @@ -273,6 +527,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), &info); + put_cpu(); } @@ -281,8 +536,6 @@ static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) - leave_mm(smp_processor_id()); } void flush_tlb_all(void) @@ -335,6 +588,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) flush_tlb_others(&batch->cpumask, &info); + cpumask_clear(&batch->cpumask); put_cpu(); |