From 1d8ca3be86ebc6a38dad8236f45c7a9c61681e78 Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Tue, 6 Nov 2018 15:12:29 -0500
Subject: x86/mm/fault: Allow stack access below %rsp

The current x86 page fault handler allows stack access below the stack
pointer if it is no more than 64k+256 bytes. Any access beyond the 64k+
limit will cause a segmentation fault.

The gcc -fstack-check option generates code that probes the stack on
large stack allocations to see if the stack is accessible. Newer gcc
versions do that while updating %rsp simultaneously; older gccs, such as
gcc 4, don't. As a result, an application compiled with an old gcc and
the -fstack-check option may fail to start at all:

  $ cat test.c
  #include <stdio.h>

  int main()
  {
          char tmp[1024*128];

          printf("### ok\n");
          return 0;
  }

  $ gcc -fstack-check -g -o test test.c
  $ ./test
  Segmentation fault

The old binary worked on older kernels, where expand_stack() was somehow
called before the check, but it does not work on newer kernels.

Besides, the 64k+ limit check is kind of crude and will not catch many
of the ways userspace applications can misbehave anyway. I think the
kernel isn't the right place for this kind of test. We should leave it
to userspace instrumentation tools to perform it.

The 64k+ limit check is now removed to just let expand_stack() decide
if a segmentation fault should happen, when the RLIMIT_STACK limit is
exceeded, for example.

Signed-off-by: Waiman Long
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1541535149-31963-1-git-send-email-longman@redhat.com
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 71d4b9d4d43f..29525cf21100 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1380,18 +1380,6 @@ retry:
         bad_area(regs, sw_error_code, address);
         return;
     }
-    if (sw_error_code & X86_PF_USER) {
-        /*
-         * Accessing the stack below %sp is always a bug.
-         * The large cushion allows instructions like enter
-         * and pusha to work. ("enter $65535, $31" pushes
-         * 32 pointers and then decrements %sp by 65535.)
-         */
-        if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
-            bad_area(regs, sw_error_code, address);
-            return;
-        }
-    }
     if (unlikely(expand_stack(vma, address))) {
         bad_area(regs, sw_error_code, address);
         return;
--
cgit v1.2.3-55-g7522
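Editorial aside: the removed check only affected accesses *below* %rsp.
A minimal sketch (not part of the patch; assumes x86-64 and gcc/clang
inline asm, and the 80 KiB offset is an arbitrary value beyond the old
64k+256 cushion) of the old-gcc-style probe that such kernels now accept:

  /* probe.c -- hypothetical demo of a gcc4-style stack probe that
   * touches memory below %rsp without moving %rsp first. */
  #include <stdio.h>

  int main(void)
  {
          /*
           * Write 80 KiB below the stack pointer. Before this patch the
           * 64k+256 cushion check turned this into a SIGSEGV; with the
           * check removed, expand_stack() simply grows the stack, as
           * long as RLIMIT_STACK permits it.
           */
          asm volatile("movq $0, -81920(%%rsp)" ::: "memory");

          printf("### ok\n");
          return 0;
  }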
From 6344be608c039f3a787f1144c46fcb04c0f76561 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 19 Nov 2018 14:45:25 -0800
Subject: x86/fault: Check user_mode(regs) when avoiding an mmap_sem deadlock

The fault-handling code that takes mmap_sem needs to avoid a deadlock
that could occur if the kernel took a bad (OOPS-worthy) page fault on a
user address while holding mmap_sem. This can only happen if the
faulting instruction was in the kernel, i.e. if !user_mode(regs).

Rather than checking the sw_error_code (which will have the USER bit
set if the fault was a USER-permission access *or* if user_mode(regs)),
just check user_mode(regs) directly.

The old code would have malfunctioned if the kernel executed a bogus
WRUSS instruction while holding mmap_sem. Fortunately, that is
extremely unlikely in current kernels, which don't use WRUSS.

Signed-off-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/4b89b542e8ceba9bd6abde2f386afed6d99244a9.1542667307.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 29525cf21100..8624cb7d8d65 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1344,13 +1344,10 @@ void do_user_addr_fault(struct pt_regs *regs,
      * Only do the expensive exception table search when we might be at
      * risk of a deadlock.  This happens if we
      * 1. Failed to acquire mmap_sem, and
-     * 2. The access did not originate in userspace.  Note: either the
-     *    hardware or earlier page fault code may set X86_PF_USER
-     *    in sw_error_code.
+     * 2. The access did not originate in userspace.
      */
     if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
-        if (!(sw_error_code & X86_PF_USER) &&
-            !search_exception_tables(regs->ip)) {
+        if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
             /*
              * Fault from code in kernel from
              * which we do not expect faults.
--
cgit v1.2.3-55-g7522
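The deadlock being avoided is generic: a fault handler must not block on
a lock that the faulting context itself may already hold. A
self-contained userspace analogue (editor's sketch; the rwlock and
handle_fault() are stand-ins for mmap_sem and the page fault handler,
not kernel API):

  #include <pthread.h>
  #include <stdio.h>

  static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

  static void handle_fault(int fault_is_from_lock_holder)
  {
          if (pthread_rwlock_tryrdlock(&lock) != 0) {
                  if (fault_is_from_lock_holder) {
                          /* Blocking would self-deadlock; bail out (the
                           * kernel OOPSes via bad_area_nosemaphore()). */
                          fprintf(stderr, "would deadlock -- OOPS path\n");
                          return;
                  }
                  /* Another context holds the lock: safe to wait. */
                  pthread_rwlock_rdlock(&lock);
          }
          /* ... handle the fault under the lock ... */
          pthread_rwlock_unlock(&lock);
  }

  int main(void)
  {
          pthread_rwlock_wrlock(&lock);   /* simulate mmap_sem held */
          handle_fault(1);                /* kernel-mode fault: no blocking */
          pthread_rwlock_unlock(&lock);

          handle_fault(0);                /* user-mode fault: free to block */
          return 0;
  }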
From dae0a10593007d049ea71601357ac41d4f247ee9 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 19 Nov 2018 14:45:27 -0800
Subject: x86/cpufeatures, x86/fault: Mark SMAP as disabled when configured out

Add X86_FEATURE_SMAP to the disabled features mask as appropriate and
use cpu_feature_enabled() in the fault code. This lets us get rid of a
redundant IS_ENABLED(CONFIG_X86_SMAP).

Signed-off-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/fe93332eded3d702f0b0b4cf83928d6830739ba3.1542667307.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/disabled-features.h | 8 +++++++-
 arch/x86/mm/fault.c                      | 5 +----
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 33833d1909af..a5ea841cc6d2 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -16,6 +16,12 @@
 # define DISABLE_MPX    (1<<(X86_FEATURE_MPX & 31))
 #endif
 
+#ifdef CONFIG_X86_SMAP
+# define DISABLE_SMAP   0
+#else
+# define DISABLE_SMAP   (1<<(X86_FEATURE_SMAP & 31))
+#endif
+
 #ifdef CONFIG_X86_INTEL_UMIP
 # define DISABLE_UMIP   0
 #else
@@ -68,7 +74,7 @@
 #define DISABLED_MASK6  0
 #define DISABLED_MASK7  (DISABLE_PTI)
 #define DISABLED_MASK8  0
-#define DISABLED_MASK9  (DISABLE_MPX)
+#define DISABLED_MASK9  (DISABLE_MPX|DISABLE_SMAP)
 #define DISABLED_MASK10 0
 #define DISABLED_MASK11 0
 #define DISABLED_MASK12 0
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8624cb7d8d65..39e39cd42097 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1150,10 +1150,7 @@ static int fault_in_kernel_space(unsigned long address)
 
 static inline bool smap_violation(int error_code, struct pt_regs *regs)
 {
-    if (!IS_ENABLED(CONFIG_X86_SMAP))
-        return false;
-
-    if (!static_cpu_has(X86_FEATURE_SMAP))
+    if (!cpu_feature_enabled(X86_FEATURE_SMAP))
         return false;
 
     if (error_code & X86_PF_USER)
--
cgit v1.2.3-55-g7522
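For reference, the DISABLE_* machinery is plain bit arithmetic: each
DISABLED_MASKn word covers 32 feature bits, and X86_FEATURE_SMAP & 31
selects the bit within its word. A standalone check (editor's sketch;
the constant mirrors X86_FEATURE_SMAP = 9*32+20 from
arch/x86/include/asm/cpufeatures.h):

  #include <stdio.h>

  #define X86_FEATURE_SMAP (9 * 32 + 20)  /* feature word 9, bit 20 */

  int main(void)
  {
          unsigned int word = X86_FEATURE_SMAP / 32;         /* -> 9  */
          unsigned int mask = 1u << (X86_FEATURE_SMAP & 31); /* bit 20 */

          /* Prints: DISABLED_MASK9 |= 0x00100000 */
          printf("DISABLED_MASK%u |= 0x%08x\n", word, mask);
          return 0;
  }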
From a15781b536293edc32bf374233f3b8ad77c3f72b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 19 Nov 2018 14:45:28 -0800
Subject: x86/fault: Fold smap_violation() into do_user_addr_fault()

smap_violation() has a single caller, and its contents are a bit
nonsensical. I'm going to fix it, but first let's fold it into its
caller for ease of comprehension.

In this particular case, the user_mode(regs) check is incorrect -- it
will cause false positives in the case of a user-initiated
kernel-privileged access.

Signed-off-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/806c366f6ca861152398ce2c01744d59d9aceb6d.1542667307.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 39e39cd42097..9d092ab74f18 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1148,20 +1148,6 @@ static int fault_in_kernel_space(unsigned long address)
     return address >= TASK_SIZE_MAX;
 }
 
-static inline bool smap_violation(int error_code, struct pt_regs *regs)
-{
-    if (!cpu_feature_enabled(X86_FEATURE_SMAP))
-        return false;
-
-    if (error_code & X86_PF_USER)
-        return false;
-
-    if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
-        return false;
-
-    return true;
-}
-
 /*
  * Called for all faults where 'address' is part of the kernel address
  * space.  Might get called for faults that originate from *code* that
@@ -1249,10 +1235,13 @@ void do_user_addr_fault(struct pt_regs *regs,
         pgtable_bad(regs, hw_error_code, address);
 
     /*
-     * Check for invalid kernel (supervisor) access to user
-     * pages in the user address space.
+     * If SMAP is on, check for invalid kernel (supervisor)
+     * access to user pages in the user address space.
      */
-    if (unlikely(smap_violation(hw_error_code, regs))) {
+    if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
+             !(hw_error_code & X86_PF_USER) &&
+             (user_mode(regs) || !(regs->flags & X86_EFLAGS_AC))))
+    {
         bad_area_nosemaphore(regs, hw_error_code, address);
         return;
     }
--
cgit v1.2.3-55-g7522

From e50928d7213e72ee95507221a89ed07d2bb6517b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 19 Nov 2018 14:45:29 -0800
Subject: x86/fault: Fix SMAP #PF handling buglet for implicit supervisor accesses

Currently, if a user program somehow triggers an implicit supervisor
access to a user address (e.g. if the kernel somehow sets LDTR to a
user address), it will be incorrectly detected as a SMAP violation if
AC is clear and SMAP is enabled. This is incorrect -- the error has
nothing to do with SMAP. Fix the condition so that only accesses with
the hardware USER bit clear are diagnosed as SMAP violations.

With the logic fixed, an implicit supervisor access to a user address
will hit the code lower in the function that is intended to handle it,
even if SMAP is enabled. That logic is still a bit buggy, and later
patches will clean it up.

I *think* this code is still correct for WRUSS, and I've added a
comment to that effect.

Signed-off-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/d1d1b2e66ef31f884dba172084486ea9423ddcdb.1542667307.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9d092ab74f18..7a69b66cf071 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1235,12 +1235,15 @@ void do_user_addr_fault(struct pt_regs *regs,
         pgtable_bad(regs, hw_error_code, address);
 
     /*
-     * If SMAP is on, check for invalid kernel (supervisor)
-     * access to user pages in the user address space.
+     * If SMAP is on, check for invalid kernel (supervisor) access to user
+     * pages in the user address space.  The odd case here is WRUSS,
+     * which, according to the preliminary documentation, does not respect
+     * SMAP and will have the USER bit set so, in all cases, SMAP
+     * enforcement appears to be consistent with the USER bit.
      */
     if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
              !(hw_error_code & X86_PF_USER) &&
-             (user_mode(regs) || !(regs->flags & X86_EFLAGS_AC))))
+             !(regs->flags & X86_EFLAGS_AC)))
     {
         bad_area_nosemaphore(regs, hw_error_code, address);
         return;
--
cgit v1.2.3-55-g7522
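The final condition is easier to audit as a truth table. An editor's
reference model in plain C (not kernel code; the function name is made
up) of when a fault is now reported as a SMAP violation:

  #include <stdbool.h>
  #include <stdio.h>

  /* SMAP on, hardware USER bit clear, and EFLAGS.AC clear. */
  static bool is_smap_violation(bool smap_on, bool pf_user, bool eflags_ac)
  {
          return smap_on && !pf_user && !eflags_ac;
  }

  int main(void)
  {
          /* WRUSS: kernel instruction, user privilege -> USER set -> 0 */
          printf("WRUSS fault:       %d\n", is_smap_violation(true, true, false));
          /* Kernel uaccess under STAC (AC set): legitimate -> 0 */
          printf("uaccess with STAC: %d\n", is_smap_violation(true, false, true));
          /* Stray kernel dereference of a user pointer (AC clear) -> 1 */
          printf("stray kernel ref:  %d\n", is_smap_violation(true, false, false));
          return 0;
  }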
From 6ea59b074f15e7ef4b042a108950861b383e7b02 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 19 Nov 2018 14:45:30 -0800
Subject: x86/fault: Improve the condition for signalling vs OOPSing

__bad_area_nosemaphore() currently checks the X86_PF_USER bit in the
error code to decide whether to send a signal or to treat the fault as
a kernel error. This can cause somewhat erratic behavior. The
straightforward cases where the CPL agrees with the hardware USER bit
are all correct, but the other cases are confusing:

 - A user instruction accessing a kernel address with supervisor
   privilege (e.g. a descriptor table access failed). The USER bit
   will be clear, and we OOPS. This is correct, because it indicates
   a kernel bug, not a user error.

 - A user instruction accessing a user address with supervisor
   privilege (e.g. a descriptor table was incorrectly pointing at user
   memory). __bad_area_nosemaphore() will be passed a modified error
   code with the USER bit set, and we will send a signal. Sending the
   signal will work (because the regs and the entry frame genuinely
   come from user mode), but we really ought to OOPS, as this event
   indicates a severe kernel bug.

 - A kernel instruction with user privilege (i.e. WRUSS). This should
   OOPS or get fixed up. The current code would instead try to send a
   signal and malfunction.

Change the logic: a signal should be sent if the faulting context is
user mode *and* the access has user privilege. Otherwise it's either a
kernel-mode fault or a failed implicit access, either of which should
end up in no_context().

Note to -stable maintainers: don't backport this unless you backport
CET. The bug it fixes is unobservable in current kernels unless
something is extremely wrong.

Signed-off-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/10e509c43893170e262e82027ea399130ae81159.1542667307.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7a69b66cf071..3c9aed03d18e 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -794,7 +794,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
     struct task_struct *tsk = current;
 
     /* User mode accesses just cause a SIGSEGV */
-    if (error_code & X86_PF_USER) {
+    if (user_mode(regs) && (error_code & X86_PF_USER)) {
         /*
          * It's possible to have interrupts off here:
          */
--
cgit v1.2.3-55-g7522
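The new rule is compact enough to model directly. An editor's sketch
(plain C, hypothetical names) enumerating the four combinations
discussed in the message above:

  #include <stdbool.h>
  #include <stdio.h>

  /* Mirrors: user_mode(regs) && (error_code & X86_PF_USER) */
  static bool sends_signal(bool user_mode_regs, bool pf_user)
  {
          return user_mode_regs && pf_user;
  }

  int main(void)
  {
          printf("user insn, user priv:    %d\n", sends_signal(true, true));   /* signal */
          printf("user insn, super priv:   %d\n", sends_signal(true, false));  /* OOPS */
          printf("kernel insn, user priv:  %d\n", sends_signal(false, true));  /* WRUSS: OOPS/fixup */
          printf("kernel insn, super priv: %d\n", sends_signal(false, false)); /* OOPS/fixup */
          return 0;
  }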
From e49d3cbef0176c182b86206185f137a87f16ab91 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 19 Nov 2018 14:45:31 -0800
Subject: x86/fault: Make error_code sanitization more robust

The error code in a page fault on a kernel address indicates whether
that address is mapped, which should not be revealed in a signal.

The normal code path for a page fault on a kernel address sanitizes the
bit, but the paths for vsyscall emulation and SIGBUS do not. Both are
harmless, but for subtle reasons. SIGBUS is never sent for a kernel
address, and vsyscall emulation will never fault on a kernel address
per se because it will fail an access_ok() check instead.

Make the code more robust by adding a helper that sets the relevant
fields and sanitizing the error code in the helper. This also cleans up
the code -- we had three copies of roughly the same thing.

Signed-off-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/b31159bd55bd0c4fa061a20dfd6c429c094bebaa.1542667307.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3c9aed03d18e..b5ec1ca2f4a0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -631,6 +631,24 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
     oops_end(flags, regs, sig);
 }
 
+static void set_signal_archinfo(unsigned long address,
+                unsigned long error_code)
+{
+    struct task_struct *tsk = current;
+
+    /*
+     * To avoid leaking information about the kernel page
+     * table layout, pretend that user-mode accesses to
+     * kernel addresses are always protection faults.
+     */
+    if (address >= TASK_SIZE_MAX)
+        error_code |= X86_PF_PROT;
+
+    tsk->thread.trap_nr = X86_TRAP_PF;
+    tsk->thread.error_code = error_code | X86_PF_USER;
+    tsk->thread.cr2 = address;
+}
+
 static noinline void
 no_context(struct pt_regs *regs, unsigned long error_code,
        unsigned long address, int signal, int si_code)
@@ -656,9 +674,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
      * faulting through the emulate_vsyscall() logic.
      */
     if (current->thread.sig_on_uaccess_err && signal) {
-        tsk->thread.trap_nr = X86_TRAP_PF;
-        tsk->thread.error_code = error_code | X86_PF_USER;
-        tsk->thread.cr2 = address;
+        set_signal_archinfo(address, error_code);
 
         /* XXX: hwpoison faults will set the wrong code. */
         force_sig_fault(signal, si_code, (void __user *)address,
@@ -821,9 +837,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
     if (likely(show_unhandled_signals))
         show_signal_msg(regs, error_code, address, tsk);
 
-    tsk->thread.cr2 = address;
-    tsk->thread.error_code = error_code;
-    tsk->thread.trap_nr = X86_TRAP_PF;
+    set_signal_archinfo(address, error_code);
 
     if (si_code == SEGV_PKUERR)
         force_sig_pkuerr((void __user *)address, pkey);
@@ -937,9 +951,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
     if (is_prefetch(regs, error_code, address))
         return;
 
-    tsk->thread.cr2 = address;
-    tsk->thread.error_code = error_code;
-    tsk->thread.trap_nr = X86_TRAP_PF;
+    set_signal_archinfo(address, error_code);
 
 #ifdef CONFIG_MEMORY_FAILURE
     if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
--
cgit v1.2.3-55-g7522
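The sanitized value is what userspace actually observes in a signal
frame. A hypothetical probe (editor's sketch, not from the patch:
assumes x86-64, glibc's REG_ERR mcontext index, and a kernel with this
change applied; the chosen kernel address is arbitrary):

  #define _GNU_SOURCE
  #include <signal.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <ucontext.h>

  static void handler(int sig, siginfo_t *si, void *ctx)
  {
          ucontext_t *uc = ctx;

          /* REG_ERR carries the page fault error code the kernel stored
           * in thread.error_code. With the sanitization in place this
           * should read 0x5 (PROT|USER) for a read of an unmapped kernel
           * address, revealing nothing about the kernel page tables. */
          fprintf(stderr, "SIGSEGV at %p, error code 0x%llx\n", si->si_addr,
                  (unsigned long long)uc->uc_mcontext.gregs[REG_ERR]);
          _exit(0);
  }

  int main(void)
  {
          struct sigaction sa = {
                  .sa_sigaction = handler,
                  .sa_flags = SA_SIGINFO,
          };

          sigaction(SIGSEGV, &sa, NULL);
          *(volatile char *)0xffff800000000000UL; /* touch a kernel address */
          return 0;
  }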
From 1ad33f5aec20f53785dbad44c6fb3b204aefd921 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 19 Nov 2018 14:45:32 -0800
Subject: x86/fault: Don't set thread.cr2, etc before OOPSing

The fault handling code sets the cr2, trap_nr, and error_code fields in
thread_struct before OOPSing. No one reads those fields during an OOPS,
so remove the code to set them.

Signed-off-by: Andy Lutomirski
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Brian Gerst
Cc: Dave Hansen
Cc: Denys Vlasenko
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/d418022aa0fad9cb40467aa7acaf4e95be50ee96.1542667307.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b5ec1ca2f4a0..b898a38093a3 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -621,10 +621,6 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
          tsk->comm, address);
     dump_pagetable(address);
 
-    tsk->thread.cr2        = address;
-    tsk->thread.trap_nr    = X86_TRAP_PF;
-    tsk->thread.error_code = error_code;
-
     if (__die("Bad pagetable", regs, error_code))
         sig = 0;
 
@@ -753,10 +749,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
     if (task_stack_end_corrupted(tsk))
         printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
 
-    tsk->thread.cr2        = address;
-    tsk->thread.trap_nr    = X86_TRAP_PF;
-    tsk->thread.error_code = error_code;
-
     sig = SIGKILL;
     if (__die("Oops", regs, error_code))
         sig = 0;
--
cgit v1.2.3-55-g7522
From 0ed32f1aa66ee758e6c8164f549f7ff9d399a20e Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 21 Nov 2018 15:11:22 -0800
Subject: x86/fault: Remove sw_error_code

All of the fault handling code now correctly checks user_mode(regs) as
needed, and nothing depends on the X86_PF_USER bit being munged. Get
rid of sw_error_code and use hw_error_code everywhere.

Signed-off-by: Andy Lutomirski
Cc: Borislav Petkov
Cc: Dave Hansen
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/078f5b8ae6e8c79ff8ee7345b5c476c45003e5ac.1542841400.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 50 +++++++++++---------------------------------------
 1 file changed, 11 insertions(+), 39 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b898a38093a3..82881bc5feef 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1217,7 +1217,6 @@ void do_user_addr_fault(struct pt_regs *regs,
             unsigned long hw_error_code,
             unsigned long address)
 {
-    unsigned long sw_error_code;
     struct vm_area_struct *vma;
     struct task_struct *tsk;
     struct mm_struct *mm;
@@ -1262,13 +1261,6 @@ void do_user_addr_fault(struct pt_regs *regs,
         return;
     }
 
-    /*
-     * hw_error_code is literally the "page fault error code" passed to
-     * the kernel directly from the hardware.  But, we will shortly be
-     * modifying it in software, so give it a new name.
-     */
-    sw_error_code = hw_error_code;
-
     /*
      * It's safe to allow irq's after cr2 has been saved and the
      * vmalloc fault has been handled.
@@ -1278,26 +1270,6 @@ void do_user_addr_fault(struct pt_regs *regs,
      */
     if (user_mode(regs)) {
         local_irq_enable();
-        /*
-         * Up to this point, X86_PF_USER set in hw_error_code
-         * indicated a user-mode access.  But, after this,
-         * X86_PF_USER in sw_error_code will indicate either
-         * that, *or* an implicit kernel(supervisor)-mode access
-         * which originated from user mode.
-         */
-        if (!(hw_error_code & X86_PF_USER)) {
-            /*
-             * The CPU was in user mode, but the CPU says
-             * the fault was not a user-mode access.
-             * Must be an implicit kernel-mode access,
-             * which we do not expect to happen in the
-             * user address space.
-             */
-            pr_warn_once("kernel-mode error from user-mode: %lx\n",
-                     hw_error_code);
-
-            sw_error_code |= X86_PF_USER;
-        }
         flags |= FAULT_FLAG_USER;
     } else {
         if (regs->flags & X86_EFLAGS_IF)
@@ -1306,9 +1278,9 @@ void do_user_addr_fault(struct pt_regs *regs,
 
     perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
-    if (sw_error_code & X86_PF_WRITE)
+    if (hw_error_code & X86_PF_WRITE)
         flags |= FAULT_FLAG_WRITE;
-    if (sw_error_code & X86_PF_INSTR)
+    if (hw_error_code & X86_PF_INSTR)
         flags |= FAULT_FLAG_INSTRUCTION;
 
 #ifdef CONFIG_X86_64
@@ -1321,7 +1293,7 @@ void do_user_addr_fault(struct pt_regs *regs,
      * The vsyscall page does not have a "real" VMA, so do this
      * emulation before we go searching for VMAs.
      */
-    if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
+    if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
         if (emulate_vsyscall(regs, address))
             return;
     }
@@ -1345,7 +1317,7 @@ void do_user_addr_fault(struct pt_regs *regs,
              * Fault from code in kernel from
              * which we do not expect faults.
              */
-            bad_area_nosemaphore(regs, sw_error_code, address);
+            bad_area_nosemaphore(regs, hw_error_code, address);
             return;
         }
 retry:
@@ -1361,17 +1333,17 @@ retry:
 
     vma = find_vma(mm, address);
     if (unlikely(!vma)) {
-        bad_area(regs, sw_error_code, address);
+        bad_area(regs, hw_error_code, address);
         return;
     }
     if (likely(vma->vm_start <= address))
         goto good_area;
     if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
-        bad_area(regs, sw_error_code, address);
+        bad_area(regs, hw_error_code, address);
         return;
     }
     if (unlikely(expand_stack(vma, address))) {
-        bad_area(regs, sw_error_code, address);
+        bad_area(regs, hw_error_code, address);
         return;
     }
 
@@ -1380,8 +1352,8 @@ retry:
      * we can handle it..
      */
 good_area:
-    if (unlikely(access_error(sw_error_code, vma))) {
-        bad_area_access_error(regs, sw_error_code, address, vma);
+    if (unlikely(access_error(hw_error_code, vma))) {
+        bad_area_access_error(regs, hw_error_code, address, vma);
         return;
     }
 
@@ -1420,13 +1392,13 @@ good_area:
         return;
 
         /* Not returning to user mode? Handle exceptions or die: */
-        no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
+        no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR);
         return;
     }
 
     up_read(&mm->mmap_sem);
     if (unlikely(fault & VM_FAULT_ERROR)) {
-        mm_fault_error(regs, sw_error_code, address, fault);
+        mm_fault_error(regs, hw_error_code, address, fault);
         return;
     }
--
cgit v1.2.3-55-g7522
From ebb53e2597e2dc7637ab213df006e99681b6ee25 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 21 Nov 2018 15:11:23 -0800
Subject: x86/fault: Don't try to recover from an implicit supervisor access

This avoids a situation in which we attempt to apply various fixups
that are not intended to handle implicit supervisor accesses from user
mode if we screw up in a way that causes this type of fault.

Signed-off-by: Andy Lutomirski
Cc: Borislav Petkov
Cc: Dave Hansen
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/9999f151d72ff352265f3274c5ab3a4105090f49.1542841400.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 82881bc5feef..ca38bd0472f2 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -653,6 +653,15 @@ no_context(struct pt_regs *regs, unsigned long error_code,
     unsigned long flags;
     int sig;
 
+    if (user_mode(regs)) {
+        /*
+         * This is an implicit supervisor-mode access from user
+         * mode.  Bypass all the kernel-mode recovery code and just
+         * OOPS.
+         */
+        goto oops;
+    }
+
     /* Are we prepared to handle this kernel fault? */
     if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
         /*
@@ -738,6 +747,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
     if (IS_ENABLED(CONFIG_EFI))
         efi_recover_from_page_fault(address);
 
+oops:
     /*
      * Oops. The kernel tried to access some bad page. We'll have to
      * terminate things with extreme prejudice:
--
cgit v1.2.3-55-g7522
From a1a371c468f7238b7826fde55786b02377faf8e2 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 21 Nov 2018 15:11:25 -0800
Subject: x86/fault: Decode page fault OOPSes better

One of Linus' favorite hobbies seems to be looking at OOPSes and
decoding the error code in his head. This is not one of my favorite
hobbies :)

Teach the page fault OOPS handler to decode the error code. If it's a
!USER fault from user mode, print an explicit note to that effect and
print out the addresses of various tables that might cause such an
error.

With this patch applied, if I intentionally point the LDT at 0x0 and
run the x86 selftests, I get:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
  HW error: normal kernel read fault
  This was a system access from user code
  IDT: 0xfffffe0000000000 (limit=0xfff) GDT: 0xfffffe0000001000 (limit=0x7f)
  LDTR: 0x50 -- base=0x0 limit=0xfff7
  TR: 0x40 -- base=0xfffffe0000003000 limit=0x206f
  PGD 800000000456e067 P4D 800000000456e067 PUD 4623067 PMD 0
  SMP PTI
  CPU: 0 PID: 153 Comm: ldt_gdt_64 Not tainted 4.19.0+ #1317
  Hardware name: ...
  RIP: 0033:0x401454

Signed-off-by: Andy Lutomirski
Cc: Borislav Petkov
Cc: Dave Hansen
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Link: http://lkml.kernel.org/r/11212acb25980cd1b3030875cd9502414fbb214d.1542841400.git.luto@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ca38bd0472f2..f5efbdba2b6d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -27,6 +27,7 @@
 #include <asm/vm86.h>          /* struct vm86 */
 #include <asm/mmu_context.h>   /* vma_pkey() */
 #include <asm/efi.h>           /* efi_recover_from_page_fault()*/
+#include <asm/desc.h>          /* store_idt(), ... */
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
@@ -571,10 +572,53 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
     return 0;
 }
 
+static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
+{
+    u32 offset = (index >> 3) * sizeof(struct desc_struct);
+    unsigned long addr;
+    struct ldttss_desc desc;
+
+    if (index == 0) {
+        pr_alert("%s: NULL\n", name);
+        return;
+    }
+
+    if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
+        pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
+        return;
+    }
+
+    if (probe_kernel_read(&desc, (void *)(gdt->address + offset),
+                  sizeof(struct ldttss_desc))) {
+        pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
+             name, index);
+        return;
+    }
+
+    addr = desc.base0 | (desc.base1 << 16) | (desc.base2 << 24);
+#ifdef CONFIG_X86_64
+    addr |= ((u64)desc.base3 << 32);
+#endif
+    pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
+         name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
+}
+
+static void errstr(unsigned long ec, char *buf, unsigned long mask,
+           const char *txt)
+{
+    if (ec & mask) {
+        if (buf[0])
+            strcat(buf, " ");
+        strcat(buf, txt);
+    }
+}
+
 static void
 show_fault_oops(struct pt_regs *regs, unsigned long error_code,
         unsigned long address)
 {
+    char errtxt[64];
+
     if (!oops_may_print())
         return;
@@ -602,6 +646,46 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
          address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
          (void *)address);
 
+    errtxt[0] = 0;
+    errstr(error_code, errtxt, X86_PF_PROT, "PROT");
+    errstr(error_code, errtxt, X86_PF_WRITE, "WRITE");
+    errstr(error_code, errtxt, X86_PF_USER, "USER");
+    errstr(error_code, errtxt, X86_PF_RSVD, "RSVD");
+    errstr(error_code, errtxt, X86_PF_INSTR, "INSTR");
+    errstr(error_code, errtxt, X86_PF_PK, "PK");
+    pr_alert("HW error: %s\n", error_code ? errtxt :
+         "normal kernel read fault");
+    if (!(error_code & X86_PF_USER) && user_mode(regs)) {
+        struct desc_ptr idt, gdt;
+        u16 ldtr, tr;
+
+        pr_alert("This was a system access from user code\n");
+
+        /*
+         * This can happen for quite a few reasons.  The more obvious
+         * ones are faults accessing the GDT, or LDT.  Perhaps
+         * surprisingly, if the CPU tries to deliver a benign or
+         * contributory exception from user code and gets a page fault
+         * during delivery, the page fault can be delivered as though
+         * it originated directly from user code.  This could happen
+         * due to wrong permissions on the IDT, GDT, LDT, TSS, or
+         * kernel or IST stack.
+         */
+        store_idt(&idt);
+
+        /* Usable even on Xen PV -- it's just slow. */
+        native_store_gdt(&gdt);
+
+        pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
+             idt.address, idt.size, gdt.address, gdt.size);
+
+        store_ldt(ldtr);
+        show_ldttss(&gdt, "LDTR", ldtr);
+
+        store_tr(tr);
+        show_ldttss(&gdt, "TR", tr);
+    }
+
     dump_pagetable(address);
 }
--
cgit v1.2.3-55-g7522
From a2aa52ab16efbee40ad118ebac4a5e438f5b43ee Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 22 Nov 2018 09:34:03 +0100
Subject: x86/fault: Clean up the page fault oops decoder a bit

 - Make the oops messages a bit less scary (don't mention 'HW errors')

 - Turn 'PROT USER' (which is visually easily confused with PROT_USER)
   into individual bit descriptors: "[PROT] [USER]". This also makes
   "[normal kernel read fault]" more apparent.

 - De-abbreviate variables to make the code easier to read

 - Use vertical alignment where appropriate.

 - Add comment about string size limits and the helper function.

 - Remove unnecessary line breaks.

Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Dave Hansen
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Yu-cheng Yu
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 15 deletions(-)

(limited to 'arch/x86/mm/fault.c')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f5efbdba2b6d..2ff25ad33233 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -603,10 +603,13 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
          name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
 }
 
-static void errstr(unsigned long ec, char *buf, unsigned long mask,
-           const char *txt)
+/*
+ * This helper function transforms the #PF error_code bits into
+ * "[PROT] [USER]" type of descriptive, almost human-readable error strings:
+ */
+static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt)
 {
-    if (ec & mask) {
+    if (error_code & mask) {
         if (buf[0])
             strcat(buf, " ");
         strcat(buf, txt);
@@ -614,10 +617,9 @@ static void errstr(unsigned long ec, char *buf, unsigned long mask,
 }
 
 static void
-show_fault_oops(struct pt_regs *regs, unsigned long error_code,
-        unsigned long address)
+show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
-    char errtxt[64];
+    char err_txt[64];
 
     if (!oops_may_print())
         return;
@@ -646,15 +648,21 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
          address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
          (void *)address);
 
-    errtxt[0] = 0;
-    errstr(error_code, errtxt, X86_PF_PROT, "PROT");
-    errstr(error_code, errtxt, X86_PF_WRITE, "WRITE");
-    errstr(error_code, errtxt, X86_PF_USER, "USER");
-    errstr(error_code, errtxt, X86_PF_RSVD, "RSVD");
-    errstr(error_code, errtxt, X86_PF_INSTR, "INSTR");
-    errstr(error_code, errtxt, X86_PF_PK, "PK");
-    pr_alert("HW error: %s\n", error_code ? errtxt :
-         "normal kernel read fault");
+    err_txt[0] = 0;
+
+    /*
+     * Note: length of these appended strings including the separation space and the
+     * zero delimiter must fit into err_txt[].
+     */
+    err_str_append(error_code, err_txt, X86_PF_PROT,  "[PROT]" );
+    err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]");
+    err_str_append(error_code, err_txt, X86_PF_USER,  "[USER]" );
+    err_str_append(error_code, err_txt, X86_PF_RSVD,  "[RSVD]" );
+    err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
+    err_str_append(error_code, err_txt, X86_PF_PK,    "[PK]"   );
+
+    pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
+
     if (!(error_code & X86_PF_USER) && user_mode(regs)) {
         struct desc_ptr idt, gdt;
         u16 ldtr, tr;
--
cgit v1.2.3-55-g7522
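Since err_str_append() only depends on strcat() and the X86_PF_* masks,
the decoder can be exercised in isolation. An editor's standalone
rendition (the X86_PF_* values mirror arch/x86/include/asm/traps.h; the
sample error code is arbitrary):

  #include <stdio.h>
  #include <string.h>

  #define X86_PF_PROT  0x01 /* 0: no page found, 1: protection fault */
  #define X86_PF_WRITE 0x02
  #define X86_PF_USER  0x04
  #define X86_PF_RSVD  0x08
  #define X86_PF_INSTR 0x10
  #define X86_PF_PK    0x20

  static void err_str_append(unsigned long error_code, char *buf,
                             unsigned long mask, const char *txt)
  {
          if (error_code & mask) {
                  if (buf[0])
                          strcat(buf, " ");
                  strcat(buf, txt);
          }
  }

  int main(void)
  {
          char err_txt[64] = "";
          unsigned long error_code = X86_PF_PROT | X86_PF_WRITE | X86_PF_USER;

          err_str_append(error_code, err_txt, X86_PF_PROT,  "[PROT]");
          err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]");
          err_str_append(error_code, err_txt, X86_PF_USER,  "[USER]");
          err_str_append(error_code, err_txt, X86_PF_RSVD,  "[RSVD]");
          err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
          err_str_append(error_code, err_txt, X86_PF_PK,    "[PK]");

          /* Prints: #PF error: [PROT] [WRITE] [USER] */
          printf("#PF error: %s\n",
                 error_code ? err_txt : "[normal kernel read fault]");
          return 0;
  }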