x86/asm/entry/64: Move opportunistic sysret code to syscall code path
This change does two things:

 1. Copy-pastes the "retint_swapgs:" code into the syscall handling code,
    under a new "syscall_return:" label. The code is unchanged apart from
    some label renames.

 2. Removes the "opportunistic sysret" code from the "retint_swapgs:" code
    block, since it is no longer reached by syscall return. This in fact
    removes most of the code in question.

   text    data     bss     dec     hex filename
  12530       0       0   12530    30f2 entry_64.o.before
  12562       0       0   12562    3112 entry_64.o

Run-tested.

Acked-and-Tested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427993219-7291-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent 4bcc7827b0
commit fffbb5dcfd

1 changed file with 86 additions and 72 deletions
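The block being moved implements an eligibility test for the SYSRET fast-return path: IRET is always correct, SYSRET is merely faster and is only safe for a completely clean 64-bit user context. As a reading aid (not kernel code), here is a minimal C sketch of that test; the struct, constants and function name are hypothetical, and the real code compares the values saved in the pt_regs area on the stack, as the diff below shows:

/*
 * Illustrative C model of the checks performed by the "syscall_return:"
 * block before using SYSRET instead of IRET. All names and constant
 * values here are assumptions made for the sketch, not kernel APIs.
 */
#include <stdbool.h>
#include <stdint.h>

#define VIRTUAL_MASK_SHIFT 47        /* assumed: 48-bit virtual addresses */
#define USER_CS  0x33                /* assumed 64-bit user code selector */
#define USER_DS  0x2b                /* assumed user data/stack selector */
#define EFLAGS_TF (1u << 8)
#define EFLAGS_RF (1u << 16)

struct saved_regs {                  /* subset of the saved user context */
	uint64_t rcx, r11, rip, rflags, cs, ss;
};

static bool can_use_sysret(const struct saved_regs *r)
{
	if (r->rcx != r->rip)              /* SYSRET loads RIP from RCX */
		return false;
	if (r->rcx >> VIRTUAL_MASK_SHIFT)  /* any of the 17 high bits set:
					      non-canonical or kernel address */
		return false;
	if (r->cs != USER_CS || r->ss != USER_DS)  /* SYSRET forces these selectors */
		return false;
	if (r->r11 != r->rflags)           /* SYSRET loads RFLAGS from R11 */
		return false;
	if (r->r11 & (EFLAGS_RF | EFLAGS_TF))  /* RF can't be restored; TF would
						  trap right after SYSRET */
		return false;
	return true;                       /* eligible for the SYSRET fast path */
}

The assembly version branches to opportunistic_sysret_failed on the first failing comparison and falls back to the IRET path via SWAPGS and restore_c_regs_and_iret.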
@@ -355,7 +355,7 @@ GLOBAL(int_with_check)
 	andl %edi,%edx
 	jnz int_careful
 	andl $~TS_COMPAT,TI_status(%rcx)
-	jmp retint_swapgs
+	jmp syscall_return
 
 	/* Either reschedule or signal or syscall exit tracking needed. */
 	/* First do a reschedule test. */
@@ -399,9 +399,86 @@ int_restore_rest:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check
+
+syscall_return:
+	/* The IRETQ could re-enable interrupts: */
+	DISABLE_INTERRUPTS(CLBR_ANY)
+	TRACE_IRQS_IRETQ
+
+	/*
+	 * Try to use SYSRET instead of IRET if we're returning to
+	 * a completely clean 64-bit userspace context.
+	 */
+	movq RCX(%rsp),%rcx
+	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+	 * in kernel space. This essentially lets the user take over
+	 * the kernel, since userspace controls RSP. It's not worth
+	 * testing for canonicalness exactly -- this check detects any
+	 * of the 17 high bits set, which is true for non-canonical
+	 * or kernel addresses. (This will pessimize vsyscall=native.
+	 * Big deal.)
+	 *
+	 * If virtual addresses ever become wider, this will need
+	 * to be updated to remain correct on both old and new CPUs.
+	 */
+	.ifne __VIRTUAL_MASK_SHIFT - 47
+	.error "virtual address width changed -- SYSRET checks need update"
+	.endif
+	shr $__VIRTUAL_MASK_SHIFT, %rcx
+	jnz opportunistic_sysret_failed
+
+	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	movq R11(%rsp),%r11
+	cmpq %r11,EFLAGS(%rsp)		/* R11 == RFLAGS */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
+	 * restoring TF results in a trap from userspace immediately after
+	 * SYSRET. This would cause an infinite loop whenever #DB happens
+	 * with register state that satisfies the opportunistic SYSRET
+	 * conditions. For example, single-stepping this user code:
+	 *
+	 *           movq $stuck_here,%rcx
+	 *           pushfq
+	 *           popq %r11
+	 * stuck_here:
+	 *
+	 * would never get past 'stuck_here'.
+	 */
+	testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+	jnz opportunistic_sysret_failed
+
+	/* nothing to check for RSP */
+
+	cmpq $__USER_DS,SS(%rsp)	/* SS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * We win! This label is here just for ease of understanding
+	 * perf profiles. Nothing jumps here.
+	 */
+syscall_return_via_sysret:
+	CFI_REMEMBER_STATE
+	/* r11 is already restored (see code above) */
+	RESTORE_C_REGS_EXCEPT_R11
+	movq RSP(%rsp),%rsp
+	USERGS_SYSRET64
+	CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
+	SWAPGS
+	jmp restore_c_regs_and_iret
 	CFI_ENDPROC
 END(system_call)
 
+
 	.macro FORK_LIKE func
 ENTRY(stub_\func)
 	CFI_STARTPROC
@@ -673,76 +750,8 @@ retint_swapgs: /* return to user-space */
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_IRETQ
 
-	/*
-	 * Try to use SYSRET instead of IRET if we're returning to
-	 * a completely clean 64-bit userspace context.
-	 */
-	movq RCX(%rsp),%rcx
-	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
-	jne opportunistic_sysret_failed
-
-	/*
-	 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
-	 * in kernel space. This essentially lets the user take over
-	 * the kernel, since userspace controls RSP. It's not worth
-	 * testing for canonicalness exactly -- this check detects any
-	 * of the 17 high bits set, which is true for non-canonical
-	 * or kernel addresses. (This will pessimize vsyscall=native.
-	 * Big deal.)
-	 *
-	 * If virtual addresses ever become wider, this will need
-	 * to be updated to remain correct on both old and new CPUs.
-	 */
-	.ifne __VIRTUAL_MASK_SHIFT - 47
-	.error "virtual address width changed -- sysret checks need update"
-	.endif
-	shr $__VIRTUAL_MASK_SHIFT, %rcx
-	jnz opportunistic_sysret_failed
-
-	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
-	jne opportunistic_sysret_failed
-
-	movq R11(%rsp),%r11
-	cmpq %r11,EFLAGS(%rsp)		/* R11 == RFLAGS */
-	jne opportunistic_sysret_failed
-
-	/*
-	 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
-	 * restoring TF results in a trap from userspace immediately after
-	 * SYSRET. This would cause an infinite loop whenever #DB happens
-	 * with register state that satisfies the opportunistic SYSRET
-	 * conditions. For example, single-stepping this user code:
-	 *
-	 *           movq $stuck_here,%rcx
-	 *           pushfq
-	 *           popq %r11
-	 * stuck_here:
-	 *
-	 * would never get past 'stuck_here'.
-	 */
-	testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
-	jnz opportunistic_sysret_failed
-
-	/* nothing to check for RSP */
-
-	cmpq $__USER_DS,SS(%rsp)	/* SS must match SYSRET */
-	jne opportunistic_sysret_failed
-
-	/*
-	 * We win! This label is here just for ease of understanding
-	 * perf profiles. Nothing jumps here.
-	 */
-irq_return_via_sysret:
-	CFI_REMEMBER_STATE
-	/* r11 is already restored (see code above) */
-	RESTORE_C_REGS_EXCEPT_R11
-	movq RSP(%rsp),%rsp
-	USERGS_SYSRET64
-	CFI_RESTORE_STATE
-
-opportunistic_sysret_failed:
 	SWAPGS
-	jmp restore_args
+	jmp restore_c_regs_and_iret
 
 	/* Returning to kernel space */
 retint_kernel:
@@ -761,7 +770,12 @@ retint_kernel:
 	 * The iretq could re-enable interrupts:
 	 */
 	TRACE_IRQS_IRETQ
-restore_args:
+
+	/*
+	 * At this label, code paths which return to kernel and to user,
+	 * which come from interrupts/exception and from syscalls, merge.
+	 */
+restore_c_regs_and_iret:
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
 
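A side note on the canonical-address comment added under the new "syscall_return:" label: the single "shr $__VIRTUAL_MASK_SHIFT, %rcx" catches any of the 17 high bits in one instruction. The following is a small, hypothetical user-space C sketch of the same arithmetic, assuming __VIRTUAL_MASK_SHIFT is 47 as the .ifne guard enforces; the sample addresses are illustrative only:

/*
 * Demonstration (not kernel code) of the shift-based address screen:
 * with 48-bit virtual addresses, shifting right by 47 leaves bits 47..63,
 * so the result is non-zero for every non-canonical address and for every
 * kernel-half address, while user-canonical addresses pass.
 */
#include <stdint.h>
#include <stdio.h>

#define VIRTUAL_MASK_SHIFT 47	/* assumed, mirrors __VIRTUAL_MASK_SHIFT */

static int sysret_rip_ok(uint64_t rip)
{
	return (rip >> VIRTUAL_MASK_SHIFT) == 0;
}

int main(void)
{
	uint64_t samples[] = {
		0x0000400000001000ULL,	/* typical user text address: ok */
		0x00007fffffffe000ULL,	/* top of user canonical range: ok */
		0x0000800000000000ULL,	/* first non-canonical address: rejected */
		0xffff880000000000ULL,	/* a kernel-half address: rejected */
	};

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%#018llx -> %s\n",
		       (unsigned long long)samples[i],
		       sysret_rip_ok(samples[i]) ? "SYSRET ok" : "fall back to IRET");
	return 0;
}

Exact canonicalness is deliberately not tested: rejecting everything with bit 47 or above set is a cheap superset of the dangerous cases, and as the comment notes it only pessimizes vsyscall=native.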