diff options
Diffstat (limited to 'arch/x86/entry/entry_64.S')
-rw-r--r-- | arch/x86/entry/entry_64.S | 408 |
1 files changed, 315 insertions, 93 deletions
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index bcfc5668dcb2..a83570495162 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -23,7 +23,6 @@ #include <asm/segment.h> #include <asm/cache.h> #include <asm/errno.h> -#include "calling.h" #include <asm/asm-offsets.h> #include <asm/msr.h> #include <asm/unistd.h> @@ -38,8 +37,11 @@ #include <asm/pgtable_types.h> #include <asm/export.h> #include <asm/frame.h> +#include <asm/nospec-branch.h> #include <linux/err.h> +#include "calling.h" + .code64 .section .entry.text, "ax" @@ -51,15 +53,19 @@ ENTRY(native_usergs_sysret64) END(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ -.macro TRACE_IRQS_IRETQ +.macro TRACE_IRQS_FLAGS flags:req #ifdef CONFIG_TRACE_IRQFLAGS - bt $9, EFLAGS(%rsp) /* interrupts off? */ + bt $9, \flags /* interrupts off? */ jnc 1f TRACE_IRQS_ON 1: #endif .endm +.macro TRACE_IRQS_IRETQ + TRACE_IRQS_FLAGS EFLAGS(%rsp) +.endm + /* * When dynamic function tracer is enabled it will add a breakpoint * to all locations that it is about to modify, sync CPUs, update @@ -136,6 +142,67 @@ END(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ + .pushsection .entry_trampoline, "ax" + +/* + * The code in here gets remapped into cpu_entry_area's trampoline. This means + * that the assembler and linker have the wrong idea as to where this code + * lives (and, in fact, it's mapped more than once, so it's not even at a + * fixed address). So we can't reference any symbols outside the entry + * trampoline and expect it to work. + * + * Instead, we carefully abuse %rip-relative addressing. + * _entry_trampoline(%rip) refers to the start of the remapped) entry + * trampoline. We can thus find cpu_entry_area with this macro: + */ + +#define CPU_ENTRY_AREA \ + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) + +/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ +#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ + SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA + +ENTRY(entry_SYSCALL_64_trampoline) + UNWIND_HINT_EMPTY + swapgs + + /* Stash the user RSP. */ + movq %rsp, RSP_SCRATCH + + /* Note: using %rsp as a scratch reg. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + + /* Load the top of the task stack into RSP */ + movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp + + /* Start building the simulated IRET frame. */ + pushq $__USER_DS /* pt_regs->ss */ + pushq RSP_SCRATCH /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + + /* + * x86 lacks a near absolute jump, and we can't jump to the real + * entry text with a relative jump. We could push the target + * address and then use retq, but this destroys the pipeline on + * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, + * spill RDI and restore it in a second-stage trampoline. + */ + pushq %rdi + movq $entry_SYSCALL_64_stage2, %rdi + JMP_NOSPEC %rdi +END(entry_SYSCALL_64_trampoline) + + .popsection + +ENTRY(entry_SYSCALL_64_stage2) + UNWIND_HINT_EMPTY + popq %rdi + jmp entry_SYSCALL_64_after_hwframe +END(entry_SYSCALL_64_stage2) + ENTRY(entry_SYSCALL_64) UNWIND_HINT_EMPTY /* @@ -145,11 +212,13 @@ ENTRY(entry_SYSCALL_64) */ swapgs + /* + * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it + * is not required to switch CR3. + */ movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - TRACE_IRQS_OFF - /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ @@ -170,6 +239,8 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ UNWIND_HINT_REGS extra=0 + TRACE_IRQS_OFF + /* * If we need to do entry work or if we guess we'll need to do * exit work, go straight to the slow path. @@ -200,7 +271,12 @@ entry_SYSCALL_64_fastpath: * It might end up jumping to the slow path. If it jumps, RAX * and all argument registers are clobbered. */ +#ifdef CONFIG_RETPOLINE + movq sys_call_table(, %rax, 8), %rax + call __x86_indirect_thunk_rax +#else call *sys_call_table(, %rax, 8) +#endif .Lentry_SYSCALL_64_after_fastpath_call: movq %rax, RAX(%rsp) @@ -221,10 +297,9 @@ entry_SYSCALL_64_fastpath: TRACE_IRQS_ON /* user mode is traced as IRQs on */ movq RIP(%rsp), %rcx movq EFLAGS(%rsp), %r11 - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp + addq $6*8, %rsp /* skip extra regs -- they were preserved */ UNWIND_HINT_EMPTY - USERGS_SYSRET64 + jmp .Lpop_c_regs_except_rcx_r11_and_sysret 1: /* @@ -246,17 +321,18 @@ entry_SYSCALL64_slow_path: call do_syscall_64 /* returns with IRQs disabled */ return_from_SYSCALL_64: - RESTORE_EXTRA_REGS TRACE_IRQS_IRETQ /* we're about to change IF */ /* * Try to use SYSRET instead of IRET if we're returning to - * a completely clean 64-bit userspace context. + * a completely clean 64-bit userspace context. If we're not, + * go to the slow exit path. */ movq RCX(%rsp), %rcx movq RIP(%rsp), %r11 - cmpq %rcx, %r11 /* RCX == RIP */ - jne opportunistic_sysret_failed + + cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ + jne swapgs_restore_regs_and_return_to_usermode /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP @@ -274,14 +350,14 @@ return_from_SYSCALL_64: /* If this changed %rcx, it was not canonical */ cmpq %rcx, %r11 - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode movq R11(%rsp), %r11 cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode /* * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot @@ -302,12 +378,12 @@ return_from_SYSCALL_64: * would never get past 'stuck_here'. */ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed + jnz swapgs_restore_regs_and_return_to_usermode /* nothing to check for RSP */ cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode /* * We win! This label is here just for ease of understanding @@ -315,14 +391,37 @@ return_from_SYSCALL_64: */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp UNWIND_HINT_EMPTY - USERGS_SYSRET64 + POP_EXTRA_REGS +.Lpop_c_regs_except_rcx_r11_and_sysret: + popq %rsi /* skip r11 */ + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rsi /* skip rcx */ + popq %rdx + popq %rsi -opportunistic_sysret_failed: - SWAPGS - jmp restore_c_regs_and_iret + /* + * Now all regs are restored except RSP and RDI. + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + + pushq RSP-RDI(%rdi) /* RSP */ + pushq (%rdi) /* RDI */ + + /* + * We are on the trampoline stack. All regs except RDI are live. + * We can do future final exit work right here. + */ + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + + popq %rdi + popq %rsp + USERGS_SYSRET64 END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) @@ -349,7 +448,7 @@ ENTRY(stub_ptregs_64) jmp entry_SYSCALL64_slow_path 1: - jmp *%rax /* Called from C */ + JMP_NOSPEC %rax /* Called from C */ END(stub_ptregs_64) .macro ptregs_stub func @@ -392,6 +491,18 @@ ENTRY(__switch_to_asm) movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset #endif +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ + /* Clobbers %rbx */ + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW +#endif + /* restore callee-saved registers */ popq %r15 popq %r14 @@ -423,13 +534,12 @@ ENTRY(ret_from_fork) movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ - SWAPGS - jmp restore_regs_and_iret + jmp swapgs_restore_regs_and_return_to_usermode 1: /* kernel thread */ movq %r12, %rdi - call *%rbx + CALL_NOSPEC %rbx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() @@ -457,12 +567,13 @@ END(irq_entries_start) .macro DEBUG_ENTRY_ASSERT_IRQS_OFF #ifdef CONFIG_DEBUG_ENTRY - pushfq - testl $X86_EFLAGS_IF, (%rsp) + pushq %rax + SAVE_FLAGS(CLBR_RAX) + testl $X86_EFLAGS_IF, %eax jz .Lokay_\@ ud2 .Lokay_\@: - addq $8, %rsp + popq %rax #endif .endm @@ -554,6 +665,13 @@ END(irq_entries_start) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func cld + + testb $3, CS-ORIG_RAX(%rsp) + jz 1f + SWAPGS + call switch_to_thread_stack +1: + ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS @@ -563,12 +681,8 @@ END(irq_entries_start) jz 1f /* - * IRQ from user mode. Switch to kernel gsbase and inform context - * tracking that we're in kernel mode. - */ - SWAPGS - - /* + * IRQ from user mode. + * * We need to tell lockdep that IRQs are off. We can't do this until * we fix gsbase, and we should do it before enter_from_user_mode * (which can take locks). Since TRACE_IRQS_OFF idempotent, @@ -612,8 +726,54 @@ GLOBAL(retint_user) mov %rsp,%rdi call prepare_exit_to_usermode TRACE_IRQS_IRETQ + +GLOBAL(swapgs_restore_regs_and_return_to_usermode) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates user mode. */ + testb $3, CS(%rsp) + jnz 1f + ud2 +1: +#endif + POP_EXTRA_REGS + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + + /* + * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + + /* Copy the IRET frame to the trampoline stack. */ + pushq 6*8(%rdi) /* SS */ + pushq 5*8(%rdi) /* RSP */ + pushq 4*8(%rdi) /* EFLAGS */ + pushq 3*8(%rdi) /* CS */ + pushq 2*8(%rdi) /* RIP */ + + /* Push user RDI on the trampoline stack. */ + pushq (%rdi) + + /* + * We are on the trampoline stack. All regs except RDI are live. + * We can do future final exit work right here. + */ + + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + + /* Restore RDI. */ + popq %rdi SWAPGS - jmp restore_regs_and_iret + INTERRUPT_RETURN + /* Returning to kernel space */ retint_kernel: @@ -633,15 +793,17 @@ retint_kernel: */ TRACE_IRQS_IRETQ -/* - * At this label, code paths which return to kernel and to user, - * which come from interrupts/exception and from syscalls, merge. - */ -GLOBAL(restore_regs_and_iret) - RESTORE_EXTRA_REGS -restore_c_regs_and_iret: - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 +GLOBAL(restore_regs_and_return_to_kernel) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates kernel mode. */ + testb $3, CS(%rsp) + jz 1f + ud2 +1: +#endif + POP_EXTRA_REGS + POP_C_REGS + addq $8, %rsp /* skip regs->orig_ax */ INTERRUPT_RETURN ENTRY(native_iret) @@ -689,7 +851,9 @@ native_irq_return_ldt: */ pushq %rdi /* Stash user RDI */ - SWAPGS + SWAPGS /* to kernel GS */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ + movq PER_CPU_VAR(espfix_waddr), %rdi movq %rax, (0*8)(%rdi) /* user RAX */ movq (1*8)(%rsp), %rax /* user RIP */ @@ -705,7 +869,6 @@ native_irq_return_ldt: /* Now RAX == RSP. */ andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */ - popq %rdi /* Restore user RDI */ /* * espfix_stack[31:16] == 0. The page tables are set up such that @@ -716,7 +879,11 @@ native_irq_return_ldt: * still points to an RO alias of the ESPFIX stack. */ orq PER_CPU_VAR(espfix_stack), %rax - SWAPGS + + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + SWAPGS /* to user GS */ + popq %rdi /* Restore user RDI */ + movq %rax, %rsp UNWIND_HINT_IRET_REGS offset=8 @@ -805,7 +972,35 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /* * Exception entry points. */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) + +/* + * Switch to the thread stack. This is called with the IRET frame and + * orig_ax on the stack. (That is, RDI..R12 are not on the stack and + * space has not been allocated for them.) + */ +ENTRY(switch_to_thread_stack) + UNWIND_HINT_FUNC + + pushq %rdi + /* Need to switch before accessing the thread stack. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI + + pushq 7*8(%rdi) /* regs->ss */ + pushq 6*8(%rdi) /* regs->rsp */ + pushq 5*8(%rdi) /* regs->eflags */ + pushq 4*8(%rdi) /* regs->cs */ + pushq 3*8(%rdi) /* regs->ip */ + pushq 2*8(%rdi) /* regs->orig_ax */ + pushq 8(%rdi) /* return address */ + UNWIND_HINT_FUNC + + movq (%rdi), %rdi + ret +END(switch_to_thread_stack) .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) @@ -818,17 +1013,18 @@ ENTRY(\sym) ASM_CLAC - .ifeq \has_error_code + .if \has_error_code == 0 pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif ALLOC_PT_GPREGS_ON_STACK - .if \paranoid - .if \paranoid == 1 + .if \paranoid < 2 testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ - jnz 1f + jnz .Lfrom_usermode_switch_stack_\@ .endif + + .if \paranoid call paranoid_entry .else call error_entry @@ -870,20 +1066,15 @@ ENTRY(\sym) jmp error_exit .endif - .if \paranoid == 1 + .if \paranoid < 2 /* - * Paranoid entry from userspace. Switch stacks and treat it + * Entry from userspace. Switch stacks and treat it * as a normal entry. This means that paranoid handlers * run in real process context if user_mode(regs). */ -1: +.Lfrom_usermode_switch_stack_\@: call error_entry - - movq %rsp, %rdi /* pt_regs pointer */ - call sync_regs - movq %rax, %rsp /* switch stack */ - movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code @@ -923,11 +1114,13 @@ ENTRY(native_load_gs_index) FRAME_BEGIN pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) + TRACE_IRQS_OFF SWAPGS .Lgs_change: movl %edi, %gs 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS + TRACE_IRQS_FLAGS (%rsp) popfq FRAME_END ret @@ -1059,6 +1252,7 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN +idtentry xennmi do_nmi has_error_code=0 idtentry xendebug do_debug has_error_code=0 idtentry xenint3 do_int3 has_error_code=0 #endif @@ -1071,7 +1265,7 @@ idtentry async_page_fault do_async_page_fault has_error_code=1 #endif #ifdef CONFIG_X86_MCE -idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) +idtentry machine_check do_mce has_error_code=0 paranoid=1 #endif /* @@ -1092,7 +1286,11 @@ ENTRY(paranoid_entry) js 1f /* negative -> in kernel */ SWAPGS xorl %ebx, %ebx -1: ret + +1: + SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 + + ret END(paranoid_entry) /* @@ -1112,17 +1310,15 @@ ENTRY(paranoid_exit) DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? */ - jnz paranoid_exit_no_swapgs + jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore -paranoid_exit_no_swapgs: + jmp .Lparanoid_exit_restore +.Lparanoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG -paranoid_exit_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 - INTERRUPT_RETURN +.Lparanoid_exit_restore: + jmp restore_regs_and_return_to_kernel END(paranoid_exit) /* @@ -1144,8 +1340,18 @@ ENTRY(error_entry) * from user mode due to an IRET fault. */ SWAPGS + /* We have user CR3. Change to kernel CR3. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax .Lerror_entry_from_usermode_after_swapgs: + /* Put us onto the real thread stack. */ + popq %r12 /* save return addr in %12 */ + movq %rsp, %rdi /* arg0 = pt_regs pointer */ + call sync_regs + movq %rax, %rsp /* switch stack */ + ENCODE_FRAME_POINTER + pushq %r12 + /* * We need to tell lockdep that IRQs are off. We can't do this until * we fix gsbase, and we should do it before enter_from_user_mode @@ -1182,6 +1388,7 @@ ENTRY(error_entry) * .Lgs_change's error handler with kernel gsbase. */ SWAPGS + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax jmp .Lerror_entry_done .Lbstep_iret: @@ -1191,10 +1398,11 @@ ENTRY(error_entry) .Lerror_bad_iret: /* - * We came from an IRET to user mode, so we have user gsbase. - * Switch to kernel gsbase: + * We came from an IRET to user mode, so we have user + * gsbase and CR3. Switch to kernel gsbase and CR3: */ SWAPGS + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax /* * Pretend that the exception came from user mode: set up pt_regs @@ -1223,10 +1431,17 @@ ENTRY(error_exit) jmp retint_user END(error_exit) -/* Runs on exception stack */ -/* XXX: broken on Xen PV */ +/* + * Runs on exception stack. Xen PV does not go through this path at all, + * so we can use real assembly here. + * + * Registers: + * %r14: Used to save/restore the CR3 of the interrupted context + * when PAGE_TABLE_ISOLATION is in use. Do not clobber. + */ ENTRY(nmi) UNWIND_HINT_IRET_REGS + /* * We allow breakpoints in NMIs. If a breakpoint occurs, then * the iretq it performs will take us out of NMI context. @@ -1284,8 +1499,9 @@ ENTRY(nmi) * stacks lest we corrupt the "NMI executing" variable. */ - SWAPGS_UNSAFE_STACK + swapgs cld + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp UNWIND_HINT_IRET_REGS base=%rdx offset=8 @@ -1328,8 +1544,7 @@ ENTRY(nmi) * Return back to user mode. We must *not* do the normal exit * work, because we don't want to enable interrupts. */ - SWAPGS - jmp restore_regs_and_iret + jmp swapgs_restore_regs_and_return_to_usermode .Lnmi_from_kernel: /* @@ -1450,7 +1665,7 @@ nested_nmi_out: popq %rdx /* We are returning to kernel mode, so this cannot result in a fault. */ - INTERRUPT_RETURN + iretq first_nmi: /* Restore rdx. */ @@ -1481,7 +1696,7 @@ first_nmi: pushfq /* RFLAGS */ pushq $__KERNEL_CS /* CS */ pushq $1f /* RIP */ - INTERRUPT_RETURN /* continues at repeat_nmi below */ + iretq /* continues at repeat_nmi below */ UNWIND_HINT_IRET_REGS 1: #endif @@ -1539,34 +1754,41 @@ end_repeat_nmi: movq $-1, %rsi call do_nmi + RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 + testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS + POP_EXTRA_REGS + POP_C_REGS - /* Point RSP at the "iret" frame. */ - REMOVE_PT_GPREGS_FROM_STACK 6*8 + /* + * Skip orig_ax and the "outermost" frame to point RSP at the "iret" + * at the "iret" frame. + */ + addq $6*8, %rsp /* * Clear "NMI executing". Set DF first so that we can easily * distinguish the remaining code between here and IRET from - * the SYSCALL entry and exit paths. On a native kernel, we - * could just inspect RIP, but, on paravirt kernels, - * INTERRUPT_RETURN can translate into a jump into a - * hypercall page. + * the SYSCALL entry and exit paths. + * + * We arguably should just inspect RIP instead, but I (Andy) wrote + * this code when I had the misapprehension that Xen PV supported + * NMIs, and Xen PV would break that approach. */ std movq $0, 5*8(%rsp) /* clear "NMI executing" */ /* - * INTERRUPT_RETURN reads the "iret" frame and exits the NMI - * stack in a single instruction. We are returning to kernel - * mode, so this cannot result in a fault. + * iretq reads the "iret" frame and exits the NMI stack in a + * single instruction. We are returning to kernel mode, so this + * cannot result in a fault. Similarly, we don't need to worry + * about espfix64 on the way back to kernel mode. */ - INTERRUPT_RETURN + iretq END(nmi) ENTRY(ignore_sysret) |