From 1d8ca3be86ebc6a38dad8236f45c7a9c61681e78 Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Tue, 6 Nov 2018 15:12:29 -0500
Subject: x86/mm/fault: Allow stack access below %rsp

The current x86 page fault handler allows stack access below the stack
pointer if it is no more than 64k+256 bytes. Any access beyond the 64k+
limit will cause a segmentation fault.

The gcc -fstack-check option generates code that probes the stack on
large stack allocations to see if the stack is accessible. Newer gcc
versions do that while updating %rsp simultaneously; older gccs, such as
gcc 4, don't. As a result, an application compiled with an old gcc and
the -fstack-check option may fail to start at all:

  $ cat test.c
  #include <stdio.h>

  int main()
  {
          char tmp[1024*128];

          printf("### ok\n");
          return 0;
  }

  $ gcc -fstack-check -g -o test test.c
  $ ./test
  Segmentation fault

The old binary worked on older kernels, where expand_stack() was somehow
called before the check, but it does not work on newer kernels.

Besides, the 64k+ limit check is kind of crude and will not catch many
of the ways userspace applications can misbehave anyway. I think the
kernel isn't the right place for this kind of test. We should leave it
to userspace instrumentation tools to perform it.

The 64k+ limit check is now removed to just let expand_stack() decide
if a segmentation fault should happen, when the RLIMIT_STACK limit is
exceeded, for example.

Signed-off-by: Waiman Long
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1541535149-31963-1-git-send-email-longman@redhat.com
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 71d4b9d4d43f..29525cf21100 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1380,18 +1380,6 @@ retry:
         bad_area(regs, sw_error_code, address);
         return;
     }
-    if (sw_error_code & X86_PF_USER) {
-        /*
-         * Accessing the stack below %sp is always a bug.
-         * The large cushion allows instructions like enter
-         * and pusha to work. ("enter $65535, $31" pushes
-         * 32 pointers and then decrements %sp by 65535.)
-         */
-        if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
-            bad_area(regs, sw_error_code, address);
-            return;
-        }
-    }
     if (unlikely(expand_stack(vma, address))) {
         bad_area(regs, sw_error_code, address);
         return;
--
cgit v1.2.3-55-g7522
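Editorial aside: the removed check only affected accesses *below* %rsp.
A minimal sketch (not part of the patch; assumes x86-64 and gcc/clang
inline asm, and the 80 KiB offset is an arbitrary value beyond the old
64k+256 cushion) of the old-gcc-style probe that such kernels now accept:

  /* probe.c -- hypothetical demo of a gcc4-style stack probe that
   * touches memory below %rsp without moving %rsp first. */
  #include <stdio.h>

  int main(void)
  {
          /*
           * Write 80 KiB below the stack pointer. Before this patch the
           * 64k+256 cushion check turned this into a SIGSEGV; with the
           * check removed, expand_stack() simply grows the stack, as
           * long as RLIMIT_STACK permits it.
           */
          asm volatile("movq $0, -81920(%%rsp)" ::: "memory");

          printf("### ok\n");
          return 0;
  }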
From 6344be608c039f3a787f1144c46fcb04c0f76561 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 19 Nov 2018 14:45:25 -0800
Subject: x86/fault: Check user_mode(regs) when avoiding an mmap_sem deadlock

The fault-handling code that takes mmap_sem needs to avoid a deadlock
that could occur if the kernel took a bad (OOPS-worthy) page fault on a
user address while holding mmap_sem. This can only happen if the
faulting instruction was in the kernel, i.e. if !user_mode(regs).

Rather than checking the sw_error_code (which will have the USER bit
set if the fault was a USER-permission access *or* if user_mode(regs)),
just check user_mode(regs) directly.

The old code would have malfunctioned if the kernel executed a bogus
WRUSS instruction while holding mmap_sem. Fortunately, that is
extremely unlikely in current kernels, which don't use WRUSS.

Signed-off-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/4b89b542e8ceba9bd6abde2f386afed6d99244a9.1542667307.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 29525cf21100..8624cb7d8d65 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1344,13 +1344,10 @@ void do_user_addr_fault(struct pt_regs *regs,
      * Only do the expensive exception table search when we might be at
      * risk of a deadlock.  This happens if we
      * 1. Failed to acquire mmap_sem, and
-     * 2. The access did not originate in userspace.  Note: either the
-     *    hardware or earlier page fault code may set X86_PF_USER
-     *    in sw_error_code.
+     * 2. The access did not originate in userspace.
      */
     if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
-        if (!(sw_error_code & X86_PF_USER) &&
-            !search_exception_tables(regs->ip)) {
+        if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
             /*
              * Fault from code in kernel from
              * which we do not expect faults.
--
cgit v1.2.3-55-g7522
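The deadlock being avoided is generic: a fault handler must not block on
a lock that the faulting context itself may already hold. A
self-contained userspace analogue (editor's sketch; the rwlock and
handle_fault() are stand-ins for mmap_sem and the page fault handler,
not kernel API):

  #include <pthread.h>
  #include <stdio.h>

  static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

  static void handle_fault(int fault_is_from_lock_holder)
  {
          if (pthread_rwlock_tryrdlock(&lock) != 0) {
                  if (fault_is_from_lock_holder) {
                          /* Blocking would self-deadlock; bail out (the
                           * kernel OOPSes via bad_area_nosemaphore()). */
                          fprintf(stderr, "would deadlock -- OOPS path\n");
                          return;
                  }
                  /* Another context holds the lock: safe to wait. */
                  pthread_rwlock_rdlock(&lock);
          }
          /* ... handle the fault under the lock ... */
          pthread_rwlock_unlock(&lock);
  }

  int main(void)
  {
          pthread_rwlock_wrlock(&lock);   /* simulate mmap_sem held */
          handle_fault(1);                /* kernel-mode fault: no blocking */
          pthread_rwlock_unlock(&lock);

          handle_fault(0);                /* user-mode fault: free to block */
          return 0;
  }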
From dae0a10593007d049ea71601357ac41d4f247ee9 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 19 Nov 2018 14:45:27 -0800
Subject: x86/cpufeatures, x86/fault: Mark SMAP as disabled when configured out

Add X86_FEATURE_SMAP to the disabled features mask as appropriate and
use cpu_feature_enabled() in the fault code. This lets us get rid of a
redundant IS_ENABLED(CONFIG_X86_SMAP).

Signed-off-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/fe93332eded3d702f0b0b4cf83928d6830739ba3.1542667307.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/disabled-features.h | 8 +++++++-
 arch/x86/mm/fault.c                      | 5 +----
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 33833d1909af..a5ea841cc6d2 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -16,6 +16,12 @@
 # define DISABLE_MPX    (1<<(X86_FEATURE_MPX & 31))
 #endif
 
+#ifdef CONFIG_X86_SMAP
+# define DISABLE_SMAP   0
+#else
+# define DISABLE_SMAP   (1<<(X86_FEATURE_SMAP & 31))
+#endif
+
 #ifdef CONFIG_X86_INTEL_UMIP
 # define DISABLE_UMIP   0
 #else
@@ -68,7 +74,7 @@
 #define DISABLED_MASK6  0
 #define DISABLED_MASK7  (DISABLE_PTI)
 #define DISABLED_MASK8  0
-#define DISABLED_MASK9  (DISABLE_MPX)
+#define DISABLED_MASK9  (DISABLE_MPX|DISABLE_SMAP)
 #define DISABLED_MASK10 0
 #define DISABLED_MASK11 0
 #define DISABLED_MASK12 0
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8624cb7d8d65..39e39cd42097 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1150,10 +1150,7 @@ static int fault_in_kernel_space(unsigned long address)
 
 static inline bool smap_violation(int error_code, struct pt_regs *regs)
 {
-    if (!IS_ENABLED(CONFIG_X86_SMAP))
-        return false;
-
-    if (!static_cpu_has(X86_FEATURE_SMAP))
+    if (!cpu_feature_enabled(X86_FEATURE_SMAP))
         return false;
 
     if (error_code & X86_PF_USER)
--
cgit v1.2.3-55-g7522
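For reference, the DISABLE_* machinery is plain bit arithmetic: each
DISABLED_MASKn word covers 32 feature bits, and X86_FEATURE_SMAP & 31
selects the bit within its word. A standalone check (editor's sketch;
the constant mirrors X86_FEATURE_SMAP = 9*32+20 from
arch/x86/include/asm/cpufeatures.h):

  #include <stdio.h>

  #define X86_FEATURE_SMAP (9 * 32 + 20)  /* feature word 9, bit 20 */

  int main(void)
  {
          unsigned int word = X86_FEATURE_SMAP / 32;         /* -> 9  */
          unsigned int mask = 1u << (X86_FEATURE_SMAP & 31); /* bit 20 */

          /* Prints: DISABLED_MASK9 |= 0x00100000 */
          printf("DISABLED_MASK%u |= 0x%08x\n", word, mask);
          return 0;
  }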
From a15781b536293edc32bf374233f3b8ad77c3f72b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 19 Nov 2018 14:45:28 -0800
Subject: x86/fault: Fold smap_violation() into do_user_addr_fault()

smap_violation() has a single caller, and its contents are a bit
nonsensical. I'm going to fix it, but first let's fold it into its
caller for ease of comprehension.

In this particular case, the user_mode(regs) check is incorrect -- it
will cause false positives in the case of a user-initiated
kernel-privileged access.

Signed-off-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/806c366f6ca861152398ce2c01744d59d9aceb6d.1542667307.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 39e39cd42097..9d092ab74f18 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1148,20 +1148,6 @@ static int fault_in_kernel_space(unsigned long address)
     return address >= TASK_SIZE_MAX;
 }
 
-static inline bool smap_violation(int error_code, struct pt_regs *regs)
-{
-    if (!cpu_feature_enabled(X86_FEATURE_SMAP))
-        return false;
-
-    if (error_code & X86_PF_USER)
-        return false;
-
-    if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
-        return false;
-
-    return true;
-}
-
 /*
  * Called for all faults where 'address' is part of the kernel address
  * space.  Might get called for faults that originate from *code* that
@@ -1249,10 +1235,13 @@ void do_user_addr_fault(struct pt_regs *regs,
         pgtable_bad(regs, hw_error_code, address);
 
     /*
-     * Check for invalid kernel (supervisor) access to user
-     * pages in the user address space.
+     * If SMAP is on, check for invalid kernel (supervisor)
+     * access to user pages in the user address space.
      */
-    if (unlikely(smap_violation(hw_error_code, regs))) {
+    if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
+             !(hw_error_code & X86_PF_USER) &&
+             (user_mode(regs) || !(regs->flags & X86_EFLAGS_AC))))
+    {
         bad_area_nosemaphore(regs, hw_error_code, address);
         return;
     }
--
cgit v1.2.3-55-g7522

From e50928d7213e72ee95507221a89ed07d2bb6517b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 19 Nov 2018 14:45:29 -0800
Subject: x86/fault: Fix SMAP #PF handling buglet for implicit supervisor accesses

Currently, if a user program somehow triggers an implicit supervisor
access to a user address (e.g. if the kernel somehow sets LDTR to a
user address), it will be incorrectly detected as a SMAP violation if
AC is clear and SMAP is enabled. This is incorrect -- the error has
nothing to do with SMAP. Fix the condition so that only accesses with
the hardware USER bit clear are diagnosed as SMAP violations.

With the logic fixed, an implicit supervisor access to a user address
will hit the code lower in the function that is intended to handle it,
even if SMAP is enabled. That logic is still a bit buggy, and later
patches will clean it up.

I *think* this code is still correct for WRUSS, and I've added a
comment to that effect.

Signed-off-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/d1d1b2e66ef31f884dba172084486ea9423ddcdb.1542667307.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9d092ab74f18..7a69b66cf071 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1235,12 +1235,15 @@ void do_user_addr_fault(struct pt_regs *regs,
         pgtable_bad(regs, hw_error_code, address);
 
     /*
-     * If SMAP is on, check for invalid kernel (supervisor)
-     * access to user pages in the user address space.
+     * If SMAP is on, check for invalid kernel (supervisor) access to user
+     * pages in the user address space.  The odd case here is WRUSS,
+     * which, according to the preliminary documentation, does not respect
+     * SMAP and will have the USER bit set so, in all cases, SMAP
+     * enforcement appears to be consistent with the USER bit.
      */
     if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
              !(hw_error_code & X86_PF_USER) &&
-             (user_mode(regs) || !(regs->flags & X86_EFLAGS_AC))))
+             !(regs->flags & X86_EFLAGS_AC)))
     {
         bad_area_nosemaphore(regs, hw_error_code, address);
         return;
--
cgit v1.2.3-55-g7522
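The final condition is easier to audit as a truth table. An editor's
reference model in plain C (not kernel code; the function name is made
up) of when a fault is now reported as a SMAP violation:

  #include <stdbool.h>
  #include <stdio.h>

  /* SMAP on, hardware USER bit clear, and EFLAGS.AC clear. */
  static bool is_smap_violation(bool smap_on, bool pf_user, bool eflags_ac)
  {
          return smap_on && !pf_user && !eflags_ac;
  }

  int main(void)
  {
          /* WRUSS: kernel instruction, user privilege -> USER set -> 0 */
          printf("WRUSS fault:       %d\n", is_smap_violation(true, true, false));
          /* Kernel uaccess under STAC (AC set): legitimate -> 0 */
          printf("uaccess with STAC: %d\n", is_smap_violation(true, false, true));
          /* Stray kernel dereference of a user pointer (AC clear) -> 1 */
          printf("stray kernel ref:  %d\n", is_smap_violation(true, false, false));
          return 0;
  }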
From 6ea59b074f15e7ef4b042a108950861b383e7b02 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 19 Nov 2018 14:45:30 -0800
Subject: x86/fault: Improve the condition for signalling vs OOPSing

__bad_area_nosemaphore() currently checks the X86_PF_USER bit in the
error code to decide whether to send a signal or to treat the fault as
a kernel error. This can cause somewhat erratic behavior. The
straightforward cases where the CPL agrees with the hardware USER bit
are all correct, but the other cases are confusing:

 - A user instruction accessing a kernel address with supervisor
   privilege (e.g. a descriptor table access failed). The USER bit
   will be clear, and we OOPS. This is correct, because it indicates
   a kernel bug, not a user error.

 - A user instruction accessing a user address with supervisor
   privilege (e.g. a descriptor table was incorrectly pointing at user
   memory). __bad_area_nosemaphore() will be passed a modified error
   code with the USER bit set, and we will send a signal. Sending the
   signal will work (because the regs and the entry frame genuinely
   come from user mode), but we really ought to OOPS, as this event
   indicates a severe kernel bug.

 - A kernel instruction with user privilege (i.e. WRUSS). This should
   OOPS or get fixed up. The current code would instead try to send a
   signal and malfunction.

Change the logic: a signal should be sent if the faulting context is
user mode *and* the access has user privilege. Otherwise it's either a
kernel-mode fault or a failed implicit access, either of which should
end up in no_context().

Note to -stable maintainers: don't backport this unless you backport
CET. The bug it fixes is unobservable in current kernels unless
something is extremely wrong.

Signed-off-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/10e509c43893170e262e82027ea399130ae81159.1542667307.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7a69b66cf071..3c9aed03d18e 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -794,7 +794,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
     struct task_struct *tsk = current;
 
     /* User mode accesses just cause a SIGSEGV */
-    if (error_code & X86_PF_USER) {
+    if (user_mode(regs) && (error_code & X86_PF_USER)) {
         /*
          * It's possible to have interrupts off here:
          */
--
cgit v1.2.3-55-g7522
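The new rule is compact enough to model directly. An editor's sketch
(plain C, hypothetical names) enumerating the four combinations
discussed in the message above:

  #include <stdbool.h>
  #include <stdio.h>

  /* Mirrors: user_mode(regs) && (error_code & X86_PF_USER) */
  static bool sends_signal(bool user_mode_regs, bool pf_user)
  {
          return user_mode_regs && pf_user;
  }

  int main(void)
  {
          printf("user insn, user priv:    %d\n", sends_signal(true, true));   /* signal */
          printf("user insn, super priv:   %d\n", sends_signal(true, false));  /* OOPS */
          printf("kernel insn, user priv:  %d\n", sends_signal(false, true));  /* WRUSS: OOPS/fixup */
          printf("kernel insn, super priv: %d\n", sends_signal(false, false)); /* OOPS/fixup */
          return 0;
  }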
From e49d3cbef0176c182b86206185f137a87f16ab91 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 19 Nov 2018 14:45:31 -0800
Subject: x86/fault: Make error_code sanitization more robust

The error code in a page fault on a kernel address indicates whether
that address is mapped, which should not be revealed in a signal.

The normal code path for a page fault on a kernel address sanitizes the
bit, but the paths for vsyscall emulation and SIGBUS do not. Both are
harmless, but for subtle reasons. SIGBUS is never sent for a kernel
address, and vsyscall emulation will never fault on a kernel address
per se because it will fail an access_ok() check instead.

Make the code more robust by adding a helper that sets the relevant
fields and sanitizing the error code in the helper. This also cleans up
the code -- we had three copies of roughly the same thing.

Signed-off-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/b31159bd55bd0c4fa061a20dfd6c429c094bebaa.1542667307.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3c9aed03d18e..b5ec1ca2f4a0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -631,6 +631,24 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
     oops_end(flags, regs, sig);
 }
 
+static void set_signal_archinfo(unsigned long address,
+                unsigned long error_code)
+{
+    struct task_struct *tsk = current;
+
+    /*
+     * To avoid leaking information about the kernel page
+     * table layout, pretend that user-mode accesses to
+     * kernel addresses are always protection faults.
+     */
+    if (address >= TASK_SIZE_MAX)
+        error_code |= X86_PF_PROT;
+
+    tsk->thread.trap_nr = X86_TRAP_PF;
+    tsk->thread.error_code = error_code | X86_PF_USER;
+    tsk->thread.cr2 = address;
+}
+
 static noinline void
 no_context(struct pt_regs *regs, unsigned long error_code,
        unsigned long address, int signal, int si_code)
@@ -656,9 +674,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
      * faulting through the emulate_vsyscall() logic.
      */
     if (current->thread.sig_on_uaccess_err && signal) {
-        tsk->thread.trap_nr = X86_TRAP_PF;
-        tsk->thread.error_code = error_code | X86_PF_USER;
-        tsk->thread.cr2 = address;
+        set_signal_archinfo(address, error_code);
 
         /* XXX: hwpoison faults will set the wrong code. */
         force_sig_fault(signal, si_code, (void __user *)address,
@@ -821,9 +837,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
     if (likely(show_unhandled_signals))
         show_signal_msg(regs, error_code, address, tsk);
 
-    tsk->thread.cr2 = address;
-    tsk->thread.error_code = error_code;
-    tsk->thread.trap_nr = X86_TRAP_PF;
+    set_signal_archinfo(address, error_code);
 
     if (si_code == SEGV_PKUERR)
         force_sig_pkuerr((void __user *)address, pkey);
@@ -937,9 +951,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
     if (is_prefetch(regs, error_code, address))
         return;
 
-    tsk->thread.cr2 = address;
-    tsk->thread.error_code = error_code;
-    tsk->thread.trap_nr = X86_TRAP_PF;
+    set_signal_archinfo(address, error_code);
 
 #ifdef CONFIG_MEMORY_FAILURE
     if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
--
cgit v1.2.3-55-g7522
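The sanitized value is what userspace actually observes in a signal
frame. A hypothetical probe (editor's sketch, not from the patch:
assumes x86-64, glibc's REG_ERR mcontext index, and a kernel with this
change applied; the chosen kernel address is arbitrary):

  #define _GNU_SOURCE
  #include <signal.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <ucontext.h>

  static void handler(int sig, siginfo_t *si, void *ctx)
  {
          ucontext_t *uc = ctx;

          /* REG_ERR carries the page fault error code the kernel stored
           * in thread.error_code. With the sanitization in place this
           * should read 0x5 (PROT|USER) for a read of an unmapped kernel
           * address, revealing nothing about the kernel page tables. */
          fprintf(stderr, "SIGSEGV at %p, error code 0x%llx\n", si->si_addr,
                  (unsigned long long)uc->uc_mcontext.gregs[REG_ERR]);
          _exit(0);
  }

  int main(void)
  {
          struct sigaction sa = {
                  .sa_sigaction = handler,
                  .sa_flags = SA_SIGINFO,
          };

          sigaction(SIGSEGV, &sa, NULL);
          *(volatile char *)0xffff800000000000UL; /* touch a kernel address */
          return 0;
  }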
From 1ad33f5aec20f53785dbad44c6fb3b204aefd921 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 19 Nov 2018 14:45:32 -0800
Subject: x86/fault: Don't set thread.cr2, etc before OOPSing

The fault handling code sets the cr2, trap_nr, and error_code fields in
thread_struct before OOPSing. No one reads those fields during an OOPS,
so remove the code to set them.

Signed-off-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/d418022aa0fad9cb40467aa7acaf4e95be50ee96.1542667307.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b5ec1ca2f4a0..b898a38093a3 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -621,10 +621,6 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
          tsk->comm, address);
     dump_pagetable(address);
 
-    tsk->thread.cr2        = address;
-    tsk->thread.trap_nr    = X86_TRAP_PF;
-    tsk->thread.error_code = error_code;
-
     if (__die("Bad pagetable", regs, error_code))
         sig = 0;
 
@@ -753,10 +749,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
     if (task_stack_end_corrupted(tsk))
         printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
 
-    tsk->thread.cr2        = address;
-    tsk->thread.trap_nr    = X86_TRAP_PF;
-    tsk->thread.error_code = error_code;
-
     sig = SIGKILL;
     if (__die("Oops", regs, error_code))
         sig = 0;
--
cgit v1.2.3-55-g7522
From 0ed32f1aa66ee758e6c8164f549f7ff9d399a20e Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 21 Nov 2018 15:11:22 -0800
Subject: x86/fault: Remove sw_error_code

All of the fault handling code now correctly checks user_mode(regs) as
needed, and nothing depends on the X86_PF_USER bit being munged. Get
rid of sw_error_code and use hw_error_code everywhere.

Signed-off-by: Andy Lutomirski
Cc: Borislav Petkov
Cc: Dave Hansen
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/078f5b8ae6e8c79ff8ee7345b5c476c45003e5ac.1542841400.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 50 +++++++++++---------------------------------------
 1 file changed, 11 insertions(+), 39 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b898a38093a3..82881bc5feef 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1217,7 +1217,6 @@ void do_user_addr_fault(struct pt_regs *regs,
             unsigned long hw_error_code,
             unsigned long address)
 {
-    unsigned long sw_error_code;
     struct vm_area_struct *vma;
     struct task_struct *tsk;
     struct mm_struct *mm;
@@ -1262,13 +1261,6 @@ void do_user_addr_fault(struct pt_regs *regs,
         return;
     }
 
-    /*
-     * hw_error_code is literally the "page fault error code" passed to
-     * the kernel directly from the hardware.  But, we will shortly be
-     * modifying it in software, so give it a new name.
-     */
-    sw_error_code = hw_error_code;
-
     /*
      * It's safe to allow irq's after cr2 has been saved and the
      * vmalloc fault has been handled.
@@ -1278,26 +1270,6 @@ void do_user_addr_fault(struct pt_regs *regs,
      */
     if (user_mode(regs)) {
         local_irq_enable();
-        /*
-         * Up to this point, X86_PF_USER set in hw_error_code
-         * indicated a user-mode access.  But, after this,
-         * X86_PF_USER in sw_error_code will indicate either
-         * that, *or* an implicit kernel(supervisor)-mode access
-         * which originated from user mode.
-         */
-        if (!(hw_error_code & X86_PF_USER)) {
-            /*
-             * The CPU was in user mode, but the CPU says
-             * the fault was not a user-mode access.
-             * Must be an implicit kernel-mode access,
-             * which we do not expect to happen in the
-             * user address space.
-             */
-            pr_warn_once("kernel-mode error from user-mode: %lx\n",
-                     hw_error_code);
-
-            sw_error_code |= X86_PF_USER;
-        }
         flags |= FAULT_FLAG_USER;
     } else {
         if (regs->flags & X86_EFLAGS_IF)
@@ -1306,9 +1278,9 @@ void do_user_addr_fault(struct pt_regs *regs,
 
     perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
-    if (sw_error_code & X86_PF_WRITE)
+    if (hw_error_code & X86_PF_WRITE)
         flags |= FAULT_FLAG_WRITE;
-    if (sw_error_code & X86_PF_INSTR)
+    if (hw_error_code & X86_PF_INSTR)
         flags |= FAULT_FLAG_INSTRUCTION;
 
 #ifdef CONFIG_X86_64
@@ -1321,7 +1293,7 @@ void do_user_addr_fault(struct pt_regs *regs,
      * The vsyscall page does not have a "real" VMA, so do this
      * emulation before we go searching for VMAs.
      */
-    if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
+    if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
         if (emulate_vsyscall(regs, address))
             return;
     }
@@ -1345,7 +1317,7 @@ void do_user_addr_fault(struct pt_regs *regs,
              * Fault from code in kernel from
              * which we do not expect faults.
              */
-            bad_area_nosemaphore(regs, sw_error_code, address);
+            bad_area_nosemaphore(regs, hw_error_code, address);
             return;
         }
 retry:
@@ -1361,17 +1333,17 @@ retry:
 
     vma = find_vma(mm, address);
     if (unlikely(!vma)) {
-        bad_area(regs, sw_error_code, address);
+        bad_area(regs, hw_error_code, address);
         return;
     }
     if (likely(vma->vm_start <= address))
         goto good_area;
     if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
-        bad_area(regs, sw_error_code, address);
+        bad_area(regs, hw_error_code, address);
         return;
     }
     if (unlikely(expand_stack(vma, address))) {
-        bad_area(regs, sw_error_code, address);
+        bad_area(regs, hw_error_code, address);
         return;
     }
 
@@ -1380,8 +1352,8 @@ retry:
      * we can handle it..
      */
 good_area:
-    if (unlikely(access_error(sw_error_code, vma))) {
-        bad_area_access_error(regs, sw_error_code, address, vma);
+    if (unlikely(access_error(hw_error_code, vma))) {
+        bad_area_access_error(regs, hw_error_code, address, vma);
         return;
     }
 
@@ -1420,13 +1392,13 @@ good_area:
         return;
 
         /* Not returning to user mode? Handle exceptions or die: */
-        no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
+        no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR);
         return;
     }
 
     up_read(&mm->mmap_sem);
     if (unlikely(fault & VM_FAULT_ERROR)) {
-        mm_fault_error(regs, sw_error_code, address, fault);
+        mm_fault_error(regs, hw_error_code, address, fault);
         return;
     }
--
cgit v1.2.3-55-g7522
From ebb53e2597e2dc7637ab213df006e99681b6ee25 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 21 Nov 2018 15:11:23 -0800
Subject: x86/fault: Don't try to recover from an implicit supervisor access

This avoids a situation in which we attempt to apply various fixups
that are not intended to handle implicit supervisor accesses from user
mode if we screw up in a way that causes this type of fault.

Signed-off-by: Andy Lutomirski
Cc: Borislav Petkov
Cc: Dave Hansen
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/9999f151d72ff352265f3274c5ab3a4105090f49.1542841400.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 82881bc5feef..ca38bd0472f2 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -653,6 +653,15 @@ no_context(struct pt_regs *regs, unsigned long error_code,
     unsigned long flags;
     int sig;
 
+    if (user_mode(regs)) {
+        /*
+         * This is an implicit supervisor-mode access from user
+         * mode.  Bypass all the kernel-mode recovery code and just
+         * OOPS.
+         */
+        goto oops;
+    }
+
     /* Are we prepared to handle this kernel fault? */
     if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
         /*
@@ -738,6 +747,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
     if (IS_ENABLED(CONFIG_EFI))
         efi_recover_from_page_fault(address);
 
+oops:
     /*
      * Oops. The kernel tried to access some bad page. We'll have to
      * terminate things with extreme prejudice:
--
cgit v1.2.3-55-g7522
From a1a371c468f7238b7826fde55786b02377faf8e2 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 21 Nov 2018 15:11:25 -0800
Subject: x86/fault: Decode page fault OOPSes better

One of Linus' favorite hobbies seems to be looking at OOPSes and
decoding the error code in his head. This is not one of my favorite
hobbies :)

Teach the page fault OOPS handler to decode the error code. If it's a
!USER fault from user mode, print an explicit note to that effect and
print out the addresses of various tables that might cause such an
error.

With this patch applied, if I intentionally point the LDT at 0x0 and
run the x86 selftests, I get:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
  HW error: normal kernel read fault
  This was a system access from user code
  IDT: 0xfffffe0000000000 (limit=0xfff) GDT: 0xfffffe0000001000 (limit=0x7f)
  LDTR: 0x50 -- base=0x0 limit=0xfff7
  TR: 0x40 -- base=0xfffffe0000003000 limit=0x206f
  PGD 800000000456e067 P4D 800000000456e067 PUD 4623067 PMD 0
  SMP PTI
  CPU: 0 PID: 153 Comm: ldt_gdt_64 Not tainted 4.19.0+ #1317
  Hardware name: ...
  RIP: 0033:0x401454

Signed-off-by: Andy Lutomirski
Cc: Borislav Petkov
Cc: Dave Hansen
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/11212acb25980cd1b3030875cd9502414fbb214d.1542841400.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ca38bd0472f2..f5efbdba2b6d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -27,6 +27,7 @@
 #include <asm/vm86.h>          /* struct vm86 */
 #include <asm/mmu_context.h>   /* vma_pkey() */
 #include <asm/efi.h>           /* efi_recover_from_page_fault()*/
+#include <asm/desc.h>          /* store_idt(), ... */
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
@@ -571,10 +572,53 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
     return 0;
 }
 
+static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
+{
+    u32 offset = (index >> 3) * sizeof(struct desc_struct);
+    unsigned long addr;
+    struct ldttss_desc desc;
+
+    if (index == 0) {
+        pr_alert("%s: NULL\n", name);
+        return;
+    }
+
+    if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
+        pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
+        return;
+    }
+
+    if (probe_kernel_read(&desc, (void *)(gdt->address + offset),
+                  sizeof(struct ldttss_desc))) {
+        pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
+             name, index);
+        return;
+    }
+
+    addr = desc.base0 | (desc.base1 << 16) | (desc.base2 << 24);
+#ifdef CONFIG_X86_64
+    addr |= ((u64)desc.base3 << 32);
+#endif
+    pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
+         name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
+}
+
+static void errstr(unsigned long ec, char *buf, unsigned long mask,
+           const char *txt)
+{
+    if (ec & mask) {
+        if (buf[0])
+            strcat(buf, " ");
+        strcat(buf, txt);
+    }
+}
+
 static void
 show_fault_oops(struct pt_regs *regs, unsigned long error_code,
         unsigned long address)
 {
+    char errtxt[64];
+
     if (!oops_may_print())
         return;
@@ -602,6 +646,46 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
          address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
          (void *)address);
 
+    errtxt[0] = 0;
+    errstr(error_code, errtxt, X86_PF_PROT, "PROT");
+    errstr(error_code, errtxt, X86_PF_WRITE, "WRITE");
+    errstr(error_code, errtxt, X86_PF_USER, "USER");
+    errstr(error_code, errtxt, X86_PF_RSVD, "RSVD");
+    errstr(error_code, errtxt, X86_PF_INSTR, "INSTR");
+    errstr(error_code, errtxt, X86_PF_PK, "PK");
+    pr_alert("HW error: %s\n", error_code ? errtxt :
+         "normal kernel read fault");
+    if (!(error_code & X86_PF_USER) && user_mode(regs)) {
+        struct desc_ptr idt, gdt;
+        u16 ldtr, tr;
+
+        pr_alert("This was a system access from user code\n");
+
+        /*
+         * This can happen for quite a few reasons.  The more obvious
+         * ones are faults accessing the GDT, or LDT.  Perhaps
+         * surprisingly, if the CPU tries to deliver a benign or
+         * contributory exception from user code and gets a page fault
+         * during delivery, the page fault can be delivered as though
+         * it originated directly from user code.  This could happen
+         * due to wrong permissions on the IDT, GDT, LDT, TSS, or
+         * kernel or IST stack.
+         */
+        store_idt(&idt);
+
+        /* Usable even on Xen PV -- it's just slow. */
+        native_store_gdt(&gdt);
+
+        pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
+             idt.address, idt.size, gdt.address, gdt.size);
+
+        store_ldt(ldtr);
+        show_ldttss(&gdt, "LDTR", ldtr);
+
+        store_tr(tr);
+        show_ldttss(&gdt, "TR", tr);
+    }
+
     dump_pagetable(address);
 }
--
cgit v1.2.3-55-g7522
From a2aa52ab16efbee40ad118ebac4a5e438f5b43ee Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 22 Nov 2018 09:34:03 +0100
Subject: x86/fault: Clean up the page fault oops decoder a bit

 - Make the oops messages a bit less scary (don't mention 'HW errors')

 - Turn 'PROT USER' (which is visually easily confused with PROT_USER)
   into individual bit descriptors: "[PROT] [USER]". This also makes
   "[normal kernel read fault]" more apparent.

 - De-abbreviate variables to make the code easier to read

 - Use vertical alignment where appropriate.

 - Add comment about string size limits and the helper function.

 - Remove unnecessary line breaks.

Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Dave Hansen
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 15 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f5efbdba2b6d..2ff25ad33233 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -603,10 +603,13 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
          name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
 }
 
-static void errstr(unsigned long ec, char *buf, unsigned long mask,
-           const char *txt)
+/*
+ * This helper function transforms the #PF error_code bits into
+ * "[PROT] [USER]" type of descriptive, almost human-readable error strings:
+ */
+static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt)
 {
-    if (ec & mask) {
+    if (error_code & mask) {
         if (buf[0])
             strcat(buf, " ");
         strcat(buf, txt);
@@ -614,10 +617,9 @@ static void errstr(unsigned long ec, char *buf, unsigned long mask,
 }
 
 static void
-show_fault_oops(struct pt_regs *regs, unsigned long error_code,
-        unsigned long address)
+show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
-    char errtxt[64];
+    char err_txt[64];
 
     if (!oops_may_print())
         return;
@@ -646,15 +648,21 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
          address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
          (void *)address);
 
-    errtxt[0] = 0;
-    errstr(error_code, errtxt, X86_PF_PROT, "PROT");
-    errstr(error_code, errtxt, X86_PF_WRITE, "WRITE");
-    errstr(error_code, errtxt, X86_PF_USER, "USER");
-    errstr(error_code, errtxt, X86_PF_RSVD, "RSVD");
-    errstr(error_code, errtxt, X86_PF_INSTR, "INSTR");
-    errstr(error_code, errtxt, X86_PF_PK, "PK");
-    pr_alert("HW error: %s\n", error_code ? errtxt :
-         "normal kernel read fault");
+    err_txt[0] = 0;
+
+    /*
+     * Note: length of these appended strings including the separation space and the
+     * zero delimiter must fit into err_txt[].
+     */
+    err_str_append(error_code, err_txt, X86_PF_PROT,  "[PROT]" );
+    err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]");
+    err_str_append(error_code, err_txt, X86_PF_USER,  "[USER]" );
+    err_str_append(error_code, err_txt, X86_PF_RSVD,  "[RSVD]" );
+    err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
+    err_str_append(error_code, err_txt, X86_PF_PK,    "[PK]"   );
+
+    pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
+
     if (!(error_code & X86_PF_USER) && user_mode(regs)) {
         struct desc_ptr idt, gdt;
         u16 ldtr, tr;
--
cgit v1.2.3-55-g7522
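Since err_str_append() only depends on strcat() and the X86_PF_* masks,
the decoder can be exercised in isolation. An editor's standalone
rendition (the X86_PF_* values mirror arch/x86/include/asm/traps.h; the
sample error code is arbitrary):

  #include <stdio.h>
  #include <string.h>

  #define X86_PF_PROT  0x01 /* 0: no page found, 1: protection fault */
  #define X86_PF_WRITE 0x02
  #define X86_PF_USER  0x04
  #define X86_PF_RSVD  0x08
  #define X86_PF_INSTR 0x10
  #define X86_PF_PK    0x20

  static void err_str_append(unsigned long error_code, char *buf,
                             unsigned long mask, const char *txt)
  {
          if (error_code & mask) {
                  if (buf[0])
                          strcat(buf, " ");
                  strcat(buf, txt);
          }
  }

  int main(void)
  {
          char err_txt[64] = "";
          unsigned long error_code = X86_PF_PROT | X86_PF_WRITE | X86_PF_USER;

          err_str_append(error_code, err_txt, X86_PF_PROT,  "[PROT]");
          err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]");
          err_str_append(error_code, err_txt, X86_PF_USER,  "[USER]");
          err_str_append(error_code, err_txt, X86_PF_RSVD,  "[RSVD]");
          err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
          err_str_append(error_code, err_txt, X86_PF_PK,    "[PK]");

          /* Prints: #PF error: [PROT] [WRITE] [USER] */
          printf("#PF error: %s\n",
                 error_code ? err_txt : "[normal kernel read fault]");
          return 0;
  }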