From 6b90bd4ba40b38dc13c2782469c1c77e4ed79915 Mon Sep 17 00:00:00 2001
From: Emese Revfy
Date: Tue, 24 May 2016 00:09:38 +0200
Subject: GCC plugin infrastructure

This patch allows to build the whole kernel with GCC plugins. It was ported from
grsecurity/PaX. The infrastructure supports building out-of-tree modules and
building in a separate directory. Cross-compilation is supported too.
Currently the x86, arm, arm64 and uml architectures enable plugins.

The directory of the gcc plugins is scripts/gcc-plugins. You can use a file or a directory
there. The plugins compile with these options:
 * -fno-rtti: gcc is compiled with this option so the plugins must use it too
 * -fno-exceptions: this is inherited from gcc too
 * -fasynchronous-unwind-tables: this is inherited from gcc too
 * -ggdb: it is useful for debugging a plugin (better backtrace on internal
    errors)
 * -Wno-narrowing: to suppress warnings from gcc headers (ipa-utils.h)
 * -Wno-unused-variable: to suppress warnings from gcc headers (gcc_version
    variable, plugin-version.h)

The infrastructure introduces a new Makefile target called gcc-plugins. It
supports all gcc versions from 4.5 to 6.0. The scripts/gcc-plugin.sh script
chooses the proper host compiler (gcc-4.7 can be built by either gcc or g++).
This script also checks the availability of the included headers in
scripts/gcc-plugins/gcc-common.h.

The gcc-common.h header contains frequently included headers for GCC plugins
and it has a compatibility layer for the supported gcc versions.

The gcc-generate-*-pass.h headers automatically generate the registration
structures for GIMPLE, SIMPLE_IPA, IPA and RTL passes.

Note that 'make clean' keeps the *.so files (only the distclean or mrproper
targets clean all) because they are needed for out-of-tree modules.

Based on work created by the PaX Team.

Signed-off-by: Emese Revfy <re.emese@gmail.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Michal Marek <mmarek@suse.com>
---
 arch/x86/entry/vdso/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/entry')

diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 253b72eaade6..f9123163a850 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -75,7 +75,7 @@ CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
        -fno-omit-frame-pointer -foptimize-sibling-calls \
        -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
 
-$(vobjs): KBUILD_CFLAGS += $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
 
 #
 # vDSO code runs in userspace and -pg doesn't help with profiling anyway.
@@ -145,6 +145,7 @@ KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
 KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
 KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
 KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
 KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
 KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
 KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
-- 
cgit v1.2.3-55-g7522


From 57f90c3dfc75da89358735c06bf9bc9815b4183e Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 18 Jul 2016 14:46:35 -0700
Subject: x86/vdso: Error out if the vDSO isn't a valid DSO

Some distros has been playing with toolchain changes that can affect
the type of ELF objects built.  Occasionally, this goes wrong and
the vDSO ends up not being a DSO at all.  This causes the kernel to
end up broken in a surprisingly subtle way -- glibc apparently
silently ignores a vDSO that isn't a DSO, so everything works,
albeit slowly, until users try a different libc implementation.

Make the kernel build process a bit more robust: fail outright if
the vDSO isn't ET_DYN or is missing its PT_DYNAMIC segment.  I've
never seen this in an unmodified kernel.

See: https://github.com/docker/docker/issues/23378

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/8a30e0a07c3b47ff917a8daa2df5e407cc0c6698.1468878336.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/vdso/vdso2c.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86/entry')

diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 63a03bb91497..4f741192846d 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -22,6 +22,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
 
 	ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff));
 
+	if (hdr->e_type != ET_DYN)
+		fail("input is not a shared object\n");
+
 	/* Walk the segment table. */
 	for (i = 0; i < GET_LE(&hdr->e_phnum); i++) {
 		if (GET_LE(&pt[i].p_type) == PT_LOAD) {
@@ -49,6 +52,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
 	if (stripped_len < load_size)
 		fail("stripped input is too short\n");
 
+	if (!dyn)
+		fail("input has no PT_DYNAMIC section -- your toolchain is buggy\n");
+
 	/* Walk the dynamic table */
 	for (i = 0; dyn + i < dyn_end &&
 		     GET_LE(&dyn[i].d_tag) != DT_NULL; i++) {
-- 
cgit v1.2.3-55-g7522


From 609c19a385c8744c41f944e2a2d8afe8e8fb860e Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Tue, 26 Jul 2016 23:12:22 -0700
Subject: x86/ptrace: Stop setting TS_COMPAT in ptrace code

Setting TS_COMPAT in ptrace is wrong: if we happen to do it during
syscall entry, then we'll confuse seccomp and audit.  (The former
isn't a security problem: seccomp is currently entirely insecure if a
malicious ptracer is attached.)  As a minimal fix, this patch adds a
new flag TS_I386_REGS_POKED that handles the ptrace special case.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pedro Alves <palves@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/5383ebed38b39fa37462139e337aff7f2314d1ca.1469599803.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/common.c            |  6 +++++-
 arch/x86/include/asm/syscall.h     |  5 +----
 arch/x86/include/asm/thread_info.h |  3 +++
 arch/x86/kernel/ptrace.c           | 15 +++++++++------
 arch/x86/kernel/signal.c           | 26 ++++++++++++++++++++++++--
 5 files changed, 42 insertions(+), 13 deletions(-)

(limited to 'arch/x86/entry')

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 9e1e27d31c6d..be8c403f8117 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -270,8 +270,12 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 	 * handling, because syscall restart has a fixup for compat
 	 * syscalls.  The fixup is exercised by the ptrace_syscall_32
 	 * selftest.
+	 *
+	 * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer
+	 * special case only applies after poking regs and before the
+	 * very next return to user mode.
 	 */
-	ti->status &= ~TS_COMPAT;
+	ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
 #endif
 
 	user_enter_irqoff();
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 999b7cd2e78c..4e23dd15c661 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task,
 	 * TS_COMPAT is set for 32-bit syscall entries and then
 	 * remains set until we return to user mode.
 	 */
-	if (task_thread_info(task)->status & TS_COMPAT)
+	if (task_thread_info(task)->status & (TS_COMPAT|TS_I386_REGS_POKED))
 		/*
 		 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
 		 * and will match correctly in comparisons.
@@ -239,9 +239,6 @@ static inline int syscall_get_arch(void)
 	 * TS_COMPAT is set for 32-bit syscall entry and then
 	 * remains set until we return to user mode.
 	 *
-	 * TIF_IA32 tasks should always have TS_COMPAT set at
-	 * system call time.
-	 *
 	 * x32 tasks should be considered AUDIT_ARCH_X86_64.
 	 */
 	if (task_thread_info(current)->status & TS_COMPAT)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 89bff044a6f5..f856c590c309 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -219,6 +219,9 @@ static inline unsigned long current_stack_pointer(void)
  * have to worry about atomic accesses.
  */
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/
+#ifdef CONFIG_COMPAT
+#define TS_I386_REGS_POKED	0x0004	/* regs poked by 32-bit ptracer */
+#endif
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 600edd225e81..f79576a541ff 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -923,15 +923,18 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
 
 	case offsetof(struct user32, regs.orig_eax):
 		/*
-		 * A 32-bit debugger setting orig_eax means to restore
-		 * the state of the task restarting a 32-bit syscall.
-		 * Make sure we interpret the -ERESTART* codes correctly
-		 * in case the task is not actually still sitting at the
-		 * exit from a 32-bit syscall with TS_COMPAT still set.
+		 * Warning: bizarre corner case fixup here.  A 32-bit
+		 * debugger setting orig_eax to -1 wants to disable
+		 * syscall restart.  Make sure that the syscall
+		 * restart code sign-extends orig_ax.  Also make sure
+		 * we interpret the -ERESTART* codes correctly if
+		 * loaded into regs->ax in case the task is not
+		 * actually still sitting at the exit from a 32-bit
+		 * syscall with TS_COMPAT still set.
 		 */
 		regs->orig_ax = value;
 		if (syscall_get_nr(child, regs) >= 0)
-			task_thread_info(child)->status |= TS_COMPAT;
+			task_thread_info(child)->status |= TS_I386_REGS_POKED;
 		break;
 
 	case offsetof(struct user32, regs.eflags):
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 22cc2f9f8aec..9747a6358949 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -760,8 +760,30 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 
 static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
 {
-#ifdef CONFIG_X86_64
-	if (in_ia32_syscall())
+	/*
+	 * This function is fundamentally broken as currently
+	 * implemented.
+	 *
+	 * The idea is that we want to trigger a call to the
+	 * restart_block() syscall and that we want in_ia32_syscall(),
+	 * in_x32_syscall(), etc. to match whatever they were in the
+	 * syscall being restarted.  We assume that the syscall
+	 * instruction at (regs->ip - 2) matches whatever syscall
+	 * instruction we used to enter in the first place.
+	 *
+	 * The problem is that we can get here when ptrace pokes
+	 * syscall-like values into regs even if we're not in a syscall
+	 * at all.
+	 *
+	 * For now, we maintain historical behavior and guess based on
+	 * stored state.  We could do better by saving the actual
+	 * syscall arch in restart_block or (with caveats on x32) by
+	 * checking if regs->ip points to 'int $0x80'.  The current
+	 * behavior is incorrect if a tracer has a different bitness
+	 * than the tracee.
+	 */
+#ifdef CONFIG_IA32_EMULATION
+	if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
 		return __NR_ia32_restart_syscall;
 #endif
 #ifdef CONFIG_X86_X32_ABI
-- 
cgit v1.2.3-55-g7522


From f7d665627e103e82d34306c7d3f6f46f387c0d8b Mon Sep 17 00:00:00 2001
From: David Howells
Date: Wed, 27 Jul 2016 11:42:38 +0100
Subject: x86/syscalls/64: Add compat_sys_keyctl for 32-bit userspace

x86_64 needs to use compat_sys_keyctl for 32-bit userspace rather than
calling sys_keyctl(). The latter will work in a lot of cases, thereby
hiding the issue.

Reported-by: Stephan Mueller <smueller@chronox.de>
Tested-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: keyrings@vger.kernel.org
Cc: linux-security-module@vger.kernel.org
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/146961615805.14395.5581949237156769439.stgit@warthog.procyon.org.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/syscalls/syscall_32.tbl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/entry')

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 4cddd17153fb..f848572169ea 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -294,7 +294,7 @@
 # 285 sys_setaltroot
 286	i386	add_key			sys_add_key
 287	i386	request_key		sys_request_key
-288	i386	keyctl			sys_keyctl
+288	i386	keyctl			sys_keyctl			compat_sys_keyctl
 289	i386	ioprio_set		sys_ioprio_set
 290	i386	ioprio_get		sys_ioprio_get
 291	i386	inotify_init		sys_inotify_init
-- 
cgit v1.2.3-55-g7522


From 3aed64f6d341cdb62bb2d6232589fb13448ce063 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Thu, 9 Jun 2016 13:06:08 +0200
Subject: pvclock: introduce seqcount-like API

The version field in struct pvclock_vcpu_time_info basically implements
a seqcount.  Wrap it with the usual read_begin and read_retry functions,
and use these APIs instead of peppering the code with smp_rmb()s.
While at it, change it to the more pedantically correct virt_rmb().

With this change, __pvclock_read_cycles can be simplified noticeably.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/entry/vdso/vclock_gettime.c |  9 ++-------
 arch/x86/include/asm/pvclock.h       | 39 +++++++++++++++++++++---------------
 arch/x86/kernel/pvclock.c            | 17 ++++++----------
 3 files changed, 31 insertions(+), 34 deletions(-)

(limited to 'arch/x86/entry')

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 2f02d23a05ef..db1e3b4c3693 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -123,9 +123,7 @@ static notrace cycle_t vread_pvclock(int *mode)
 	 */
 
 	do {
-		version = pvti->version;
-
-		smp_rmb();
+		version = pvclock_read_begin(pvti);
 
 		if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
 			*mode = VCLOCK_NONE;
@@ -137,10 +135,7 @@ static notrace cycle_t vread_pvclock(int *mode)
 		pvti_tsc_shift = pvti->tsc_shift;
 		pvti_system_time = pvti->system_time;
 		pvti_tsc = pvti->tsc_timestamp;
-
-		/* Make sure that the version double-check is last. */
-		smp_rmb();
-	} while (unlikely((version & 1) || version != pvti->version));
+	} while (pvclock_read_retry(pvti, version));
 
 	delta = tsc - pvti_tsc;
 	ret = pvti_system_time +
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 7c1c89598688..d019f0cc80ec 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -25,6 +25,24 @@ void pvclock_resume(void);
 
 void pvclock_touch_watchdogs(void);
 
+static __always_inline
+unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src)
+{
+	unsigned version = src->version & ~1;
+	/* Make sure that the version is read before the data. */
+	virt_rmb();
+	return version;
+}
+
+static __always_inline
+bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
+			unsigned version)
+{
+	/* Make sure that the version is re-read after the data. */
+	virt_rmb();
+	return unlikely(version != src->version);
+}
+
 /*
  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
  * yielding a 64-bit result.
@@ -69,23 +87,12 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 }
 
 static __always_inline
-unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
-			       cycle_t *cycles, u8 *flags)
+cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
 {
-	unsigned version;
-	cycle_t offset;
-	u64 delta;
-
-	version = src->version;
-	/* Make the latest version visible */
-	smp_rmb();
-
-	delta = rdtsc_ordered() - src->tsc_timestamp;
-	offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
-				   src->tsc_shift);
-	*cycles = src->system_time + offset;
-	*flags = src->flags;
-	return version;
+	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
+	cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
+					     src->tsc_shift);
+	return src->system_time + offset;
 }
 
 struct pvclock_vsyscall_time_info {
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 06c58ce46762..3599404e3089 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -64,14 +64,9 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
 	u8 flags;
 
 	do {
-		version = src->version;
-		/* Make the latest version visible */
-		smp_rmb();
-
+		version = pvclock_read_begin(src);
 		flags = src->flags;
-		/* Make sure that the version double-check is last. */
-		smp_rmb();
-	} while ((src->version & 1) || version != src->version);
+	} while (pvclock_read_retry(src, version));
 
 	return flags & valid_flags;
 }
@@ -84,10 +79,10 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 	u8 flags;
 
 	do {
-		version = __pvclock_read_cycles(src, &ret, &flags);
-		/* Make sure that the version double-check is last. */
-		smp_rmb();
-	} while ((src->version & 1) || version != src->version);
+		version = pvclock_read_begin(src);
+		ret = __pvclock_read_cycles(src);
+		flags = src->flags;
+	} while (pvclock_read_retry(src, version));
 
 	if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
 		src->flags &= ~PVCLOCK_GUEST_STOPPED;
-- 
cgit v1.2.3-55-g7522


From abe9efa79be02cf2ba27f643b214b07877bb050b Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Thu, 9 Jun 2016 20:12:34 +0200
Subject: x86: vdso: use __pvclock_read_cycles

The new simplified __pvclock_read_cycles does the same computation
as vread_pvclock, except that (because it takes the pvclock_vcpu_time_info
pointer) it has to be moved inside the loop.  Since the loop is expected to
never roll, this makes no difference.

Acked-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/entry/vdso/vclock_gettime.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'arch/x86/entry')

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index db1e3b4c3693..94d54d0defa7 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -96,9 +96,8 @@ static notrace cycle_t vread_pvclock(int *mode)
 {
 	const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
 	cycle_t ret;
-	u64 tsc, pvti_tsc;
-	u64 last, delta, pvti_system_time;
-	u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
+	u64 last;
+	u32 version;
 
 	/*
 	 * Note: The kernel and hypervisor must guarantee that cpu ID
@@ -130,18 +129,9 @@ static notrace cycle_t vread_pvclock(int *mode)
 			return 0;
 		}
 
-		tsc = rdtsc_ordered();
-		pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
-		pvti_tsc_shift = pvti->tsc_shift;
-		pvti_system_time = pvti->system_time;
-		pvti_tsc = pvti->tsc_timestamp;
+		ret = __pvclock_read_cycles(pvti);
 	} while (pvclock_read_retry(pvti, version));
 
-	delta = tsc - pvti_tsc;
-	ret = pvti_system_time +
-		pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
-				    pvti_tsc_shift);
-
 	/* refer to vread_tsc() comment for rationale */
 	last = gtod->cycle_last;
 
-- 
cgit v1.2.3-55-g7522


From 469f00231278da68062a809306df0bac95a27507 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko
Date: Fri, 15 Jul 2016 11:42:43 +0200
Subject: x86, kasan, ftrace: Put APIC interrupt handlers into .irqentry.text

Dmitry Vyukov has reported unexpected KASAN stackdepot growth:

  https://github.com/google/kasan/issues/36

... which is caused by the APIC handlers not being present in .irqentry.text:

When building with CONFIG_FUNCTION_GRAPH_TRACER=y or CONFIG_KASAN=y, put the
APIC interrupt handlers into the .irqentry.text section. This is needed
because both KASAN and function graph tracer use __irqentry_text_start and
__irqentry_text_end to determine whether a function is an IRQ entry point.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Alexander Potapenko <glider@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: aryabinin@virtuozzo.com
Cc: kasan-dev@googlegroups.com
Cc: kcc@google.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/1468575763-144889-1-git-send-email-glider@google.com
[ Minor edits. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64.S | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86/entry')

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index b846875aeea6..9f85827db24e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -601,9 +601,20 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym)
 .endm
 #endif
 
+/* Make sure APIC interrupt handlers end up in the irqentry section: */
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+# define PUSH_SECTION_IRQENTRY	.pushsection .irqentry.text, "ax"
+# define POP_SECTION_IRQENTRY	.popsection
+#else
+# define PUSH_SECTION_IRQENTRY
+# define POP_SECTION_IRQENTRY
+#endif
+
 .macro apicinterrupt num sym do_sym
+PUSH_SECTION_IRQENTRY
 apicinterrupt3 \num \sym \do_sym
 trace_apicinterrupt \num \sym
+POP_SECTION_IRQENTRY
 .endm
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3-55-g7522


From 3e035305875cfa8a58c1ca573d0cfa6a7f201f27 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Wed, 3 Aug 2016 19:14:29 +0200
Subject: x86/entry: Clarify the RF saving/restoring situation with
 SYSCALL/SYSRET

Clarify why exactly RF cannot be restored properly by SYSRET to avoid
confusion.

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160803171429.GA2590@nazgul.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64.S | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'arch/x86/entry')

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9f85827db24e..d172c619c449 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -288,11 +288,15 @@ return_from_SYSCALL_64:
 	jne	opportunistic_sysret_failed
 
 	/*
-	 * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
-	 * restoring TF results in a trap from userspace immediately after
-	 * SYSRET.  This would cause an infinite loop whenever #DB happens
-	 * with register state that satisfies the opportunistic SYSRET
-	 * conditions.  For example, single-stepping this user code:
+	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
+	 * restore RF properly. If the slowpath sets it for whatever reason, we
+	 * need to restore it correctly.
+	 *
+	 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
+	 * trap from userspace immediately after SYSRET.  This would cause an
+	 * infinite loop whenever #DB happens with register state that satisfies
+	 * the opportunistic SYSRET conditions.  For example, single-stepping
+	 * this user code:
 	 *
 	 *           movq	$stuck_here, %rcx
 	 *           pushfq
-- 
cgit v1.2.3-55-g7522


From 5e44258d168b2bdee51d9e2e1f1f4726ff9775cd Mon Sep 17 00:00:00 2001
From: Valdis Kletnieks
Date: Sun, 31 Jul 2016 23:24:50 -0400
Subject: x86/build: Reduce the W=1 warnings noise when compiling x86 syscall
 tables

Building an X86_64 kernel with W=1 throws a total of 9,948 lines of warnings of
this form for both 32-bit and 64-bit syscall tables. Given that the entire rest
of the build for my config only generates 8,375 lines of output, this is a big
reduction in the warnings generated.

The warnings follow this pattern:

  ./arch/x86/include/generated/asm/syscalls_32.h:885:21: warning: initialized field overwritten [-Woverride-init]
   __SYSCALL_I386(379, compat_sys_pwritev2, )
                     ^
  arch/x86/entry/syscall_32.c:13:46: note: in definition of macro '__SYSCALL_I386'
   #define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
                                              ^~~
  ./arch/x86/include/generated/asm/syscalls_32.h:885:21: note: (near initialization for 'ia32_sys_call_table[379]')
   __SYSCALL_I386(379, compat_sys_pwritev2, )
                     ^
  arch/x86/entry/syscall_32.c:13:46: note: in definition of macro '__SYSCALL_I386'
   #define __SYSCALL_I386(nr, sym, qual) [nr] = sym,

Since we intentionally build the syscall tables this way, ignore that one
warning in the two files.

Signed-off-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/7464.1470021890@turing-police.cc.vt.edu
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/Makefile | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/entry')

diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index fe91c25092da..77f28ce9c646 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -5,6 +5,8 @@
 OBJECT_FILES_NON_STANDARD_entry_$(BITS).o   := y
 OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
 
+CFLAGS_syscall_64.o		+= -Wno-override-init
+CFLAGS_syscall_32.o		+= -Wno-override-init
 obj-y				:= entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
 obj-y				+= common.o
 
-- 
cgit v1.2.3-55-g7522