kernel/x86_64: save fpu state at interrupts
The kernel is allowed to use the FPU anywhere, so we must make sure that user state is not clobbered, by saving the FPU state at interrupt entry. There is no need to do that for system calls, since all FPU data registers are caller-saved. We also do not need to save the whole FPU state at task switch (again, thanks to the calling convention); only the status and control registers are preserved. This patch does add xmm0-15 to the clobber list of the task switch code, but only to make sure that nothing bad happens inside the function that performs the task switch. Inspection of the generated code shows that no xmm registers are actually saved.

Signed-off-by: Paweł Dziepak <pdziepak@quarnos.org>
parent b41f281071
commit 396b74228e
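For context, here is a minimal sketch (not part of the patch) of the two strategies the commit message describes, written as self-contained GCC inline-assembly helpers for x86-64. The names (save_fpu_control_state, fpu_save_area, and so on) are illustrative, not Haiku's.

/* Hedged sketch, not Haiku code; helper names are illustrative. */
#include <stdint.h>

/* Context switch: the SysV AMD64 calling convention makes every x87/SSE data
 * register caller-saved, so only the x87 control word and MXCSR need to
 * survive the switch. */
static inline void
save_fpu_control_state(uint16_t* fpuControl, uint32_t* sseControl)
{
	asm volatile("fnstcw %0" : "=m" (*fpuControl));
	asm volatile("stmxcsr %0" : "=m" (*sseControl));
}

static inline void
restore_fpu_control_state(const uint16_t* fpuControl, const uint32_t* sseControl)
{
	asm volatile("ldmxcsr %0" : : "m" (*sseControl));
	asm volatile("fldcw %0" : : "m" (*fpuControl));
}

/* Interrupt entry: the kernel may clobber any FPU/SSE register before the
 * handler returns, so the full 512-byte FXSAVE image is written to a
 * 16-byte aligned area on entry and restored on the way out. */
struct fpu_save_area {
	uint8_t image[512];
} __attribute__((aligned(16)));

static inline void
save_full_fpu_state(struct fpu_save_area* area)
{
	asm volatile("fxsaveq %0" : "=m" (*area));
}

static inline void
restore_full_fpu_state(const struct fpu_save_area* area)
{
	asm volatile("fxrstorq %0" : : "m" (*area));
}

The split mirrors the patch: a cheap control/status save on the voluntary switch path, and a full fxsave/fxrstor only where the interrupted context could be using any FPU register.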
@@ -28,6 +28,10 @@ x86_write_msr(uint32_t msr, uint64_t value)

static inline void
x86_context_switch(arch_thread* oldState, arch_thread* newState)
{
	uint16_t fpuControl;
	asm volatile("fnstcw %0" : "=m" (fpuControl));
	uint32_t sseControl;
	asm volatile("stmxcsr %0" : "=m" (sseControl));

	asm volatile(
		"pushq %%rbp;"
		"movq $1f, %c[rip](%0);"

@@ -41,7 +45,11 @@ x86_context_switch(arch_thread* oldState, arch_thread* newState)
		[rsp] "i" (offsetof(arch_thread, current_stack)),
		[rip] "i" (offsetof(arch_thread, instruction_pointer))
		: "rbx", "rcx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13",
		"r14", "r15", "memory");
		"r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
		"xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
		"xmm14", "xmm15", "memory");

	asm volatile("ldmxcsr %0" : : "m" (sseControl));
	asm volatile("fldcw %0" : : "m" (fpuControl));
}

@@ -8,6 +8,7 @@

struct iframe {
	uint64 type;
	void* fpu;
	uint64 r15;
	uint64 r14;
	uint64 r13;
@@ -454,10 +454,7 @@ void __x86_setup_system_time(uint32 conversionFactor,

void x86_userspace_thread_exit(void);
void x86_end_userspace_thread_exit(void);
void x86_fxsave(void* fpuState);
void x86_fxrstor(const void* fpuState);
void x86_noop_swap(void* oldFpuState, const void* newFpuState);
void x86_fxsave_swap(void* oldFpuState, const void* newFpuState);

addr_t x86_get_stack_frame();
uint32 x86_count_mtrrs(void);
void x86_set_mtrr(uint32 index, uint64 base, uint64 length, uint8 type);

@@ -488,7 +485,13 @@ void x86_context_switch(struct arch_thread* oldState,

void x86_fnsave(void* fpuState);
void x86_frstor(const void* fpuState);

void x86_fxsave(void* fpuState);
void x86_fxrstor(const void* fpuState);

void x86_noop_swap(void* oldFpuState, const void* newFpuState);
void x86_fnsave_swap(void* oldFpuState, const void* newFpuState);
void x86_fxsave_swap(void* oldFpuState, const void* newFpuState);

#endif

@@ -19,35 +19,6 @@
.text


/* void x86_fxsave(void* fpuState); */
FUNCTION(x86_fxsave):
	fxsave (%rdi)
	ret
FUNCTION_END(x86_fxsave)


/* void x86_fxrstor(const void* fpuState); */
FUNCTION(x86_fxrstor):
	fxrstor (%rdi)
	ret
FUNCTION_END(x86_fxrstor)


/* void x86_noop_swap(void *oldFpuState, const void *newFpuState); */
FUNCTION(x86_noop_swap):
	nop
	ret
FUNCTION_END(x86_noop_swap)


/* void x86_fxsave_swap(void* oldFpuState, const void* newFpuState); */
FUNCTION(x86_fxsave_swap):
	fxsave (%rdi)
	fxrstor (%rsi)
	ret
FUNCTION_END(x86_fxsave_swap)


/* addr_t x86_get_stack_frame(); */
FUNCTION(x86_get_stack_frame):
	mov %rbp, %rax

@@ -35,11 +35,13 @@
	push %r13; \
	push %r14; \
	push %r15; \
	pushq $0; \
	push $iframeType;


// Restore the interrupt frame.
#define RESTORE_IFRAME() \
	add $8, %rsp; \
	add $16, %rsp; \
	pop %r15; \
	pop %r14; \
	pop %r13; \
@@ -198,11 +200,18 @@ STATIC_FUNCTION(int_bottom):
	// exception.
	orq $X86_EFLAGS_RESUME, IFRAME_flags(%rbp)

	subq $512, %rsp
	andq $~15, %rsp
	fxsaveq (%rsp)

	// Call the interrupt handler.
	movq %rsp, %rdi
	movq IFRAME_vector(%rsp), %rax
	movq %rbp, %rdi
	movq IFRAME_vector(%rbp), %rax
	call *gInterruptHandlerTable(, %rax, 8)

	fxrstorq (%rsp)
	movq %rbp, %rsp

	// Restore the saved registers.
	RESTORE_IFRAME()

@@ -217,12 +226,16 @@ STATIC_FUNCTION(int_bottom_user):

	// Push the rest of the interrupt frame to the stack.
	PUSH_IFRAME_BOTTOM(IFRAME_TYPE_OTHER)

	cld

	// Frame pointer is the iframe.
	movq %rsp, %rbp

	subq $512, %rsp
	andq $~15, %rsp
	fxsaveq (%rsp)
	movq %rsp, IFRAME_fpu(%rbp)

	// Set the RF (resume flag) in RFLAGS. This prevents an instruction
	// breakpoint on the instruction we're returning to to trigger a debug
	// exception.
@@ -235,8 +248,8 @@ STATIC_FUNCTION(int_bottom_user):
	UPDATE_THREAD_USER_TIME()

	// Call the interrupt handler.
	movq %rsp, %rdi
	movq IFRAME_vector(%rsp), %rax
	movq %rbp, %rdi
	movq IFRAME_vector(%rbp), %rax
	call *gInterruptHandlerTable(, %rax, 8)

	// If there are no signals pending or we're not debugging, we can avoid
@@ -250,6 +263,9 @@ STATIC_FUNCTION(int_bottom_user):

	UPDATE_THREAD_KERNEL_TIME()

	fxrstorq (%rsp)
	movq %rbp, %rsp

	// Restore the saved registers.
	RESTORE_IFRAME()

@@ -274,6 +290,9 @@ STATIC_FUNCTION(int_bottom_user):
	movq %rbp, %rdi
	call x86_init_user_debug_at_kernel_exit
1:
	fxrstorq (%rsp)
	movq %rbp, %rsp

	// Restore the saved registers.
	RESTORE_IFRAME()

@@ -395,7 +414,7 @@ FUNCTION(x86_64_syscall_entry):

	// If we've just restored a signal frame, use the IRET path.
	cmpq $SYSCALL_RESTORE_SIGNAL_FRAME, %r14
	je .Liret
	je .Lrestore_fpu

	// Restore the iframe and RCX/R11 for SYSRET.
	RESTORE_IFRAME()
@@ -466,7 +485,11 @@ FUNCTION(x86_64_syscall_entry):
	// On this return path it is possible that the frame has been modified,
	// for example to execute a signal handler. In this case it is safer to
	// return via IRET.
	jmp .Liret

.Lrestore_fpu:
	movq IFRAME_fpu(%rbp), %rax
	fxrstorq (%rax)
.Liret:
	// Restore the saved registers.
	RESTORE_IFRAME()
@@ -537,7 +560,7 @@ FUNCTION(x86_return_to_userland):
	testl $(THREAD_FLAGS_DEBUGGER_INSTALLED | THREAD_FLAGS_SIGNALS_PENDING \
		| THREAD_FLAGS_DEBUG_THREAD | THREAD_FLAGS_BREAKPOINTS_DEFINED) \
		, THREAD_flags(%r12)
	jnz .Lkernel_exit_work
	jnz .Luserland_return_work

	// update the thread's kernel time and return
	UPDATE_THREAD_KERNEL_TIME()
@@ -546,4 +569,33 @@ FUNCTION(x86_return_to_userland):
	RESTORE_IFRAME()
	swapgs
	iretq
.Luserland_return_work:
	// Slow path for return to userland.

	// Do we need to handle signals?
	testl $(THREAD_FLAGS_SIGNALS_PENDING | THREAD_FLAGS_DEBUG_THREAD) \
		, THREAD_flags(%r12)
	jnz .Luserland_return_handle_signals
	cli
	call thread_at_kernel_exit_no_signals

.Luserland_return_work_done:
	// Install breakpoints, if defined.
	testl $THREAD_FLAGS_BREAKPOINTS_DEFINED, THREAD_flags(%r12)
	jz 1f
	movq %rbp, %rdi
	call x86_init_user_debug_at_kernel_exit
1:
	// Restore the saved registers.
	RESTORE_IFRAME()

	// Restore the previous GS base and return.
	swapgs
	iretq
.Luserland_return_handle_signals:
	// thread_at_kernel_exit requires interrupts to be enabled, it will disable
	// them after.
	sti
	call thread_at_kernel_exit
	jmp .Luserland_return_work_done
FUNCTION_END(x86_return_to_userland)

@@ -134,9 +134,12 @@ arch_thread_init(kernel_args* args)
{
	// Save one global valid FPU state; it will be copied in the arch dependent
	// part of each new thread.
	asm volatile ("clts; fninit; fnclex;");
	x86_fxsave(sInitialState.fpu_state);

	asm volatile (
		"clts;" \
		"fninit;" \
		"fnclex;" \
		"fxsave %0;"
		: "=m" (sInitialState.fpu_state));
	return B_OK;
}

@@ -296,15 +299,14 @@ arch_setup_signal_frame(Thread* thread, struct sigaction* action,
	signalFrameData->context.uc_mcontext.rip = frame->ip;
	signalFrameData->context.uc_mcontext.rflags = frame->flags;

	// Store the FPU state. There appears to be a bug in GCC where the aligned
	// attribute on a structure is being ignored when the structure is allocated
	// on the stack, so even if the fpu_state struct has aligned(16) it may not
	// get aligned correctly. Instead, use the current thread's FPU save area
	// and then memcpy() to the frame structure.
	x86_fxsave(thread->arch_info.fpu_state);
	memcpy((void*)&signalFrameData->context.uc_mcontext.fpu,
		thread->arch_info.fpu_state,
		sizeof(signalFrameData->context.uc_mcontext.fpu));
	if (frame->fpu != nullptr) {
		memcpy((void*)&signalFrameData->context.uc_mcontext.fpu, frame->fpu,
			sizeof(signalFrameData->context.uc_mcontext.fpu));
	} else {
		memcpy((void*)&signalFrameData->context.uc_mcontext.fpu,
			sInitialState.fpu_state,
			sizeof(signalFrameData->context.uc_mcontext.fpu));
	}

	// Fill in signalFrameData->context.uc_stack.
	signal_get_user_stack(frame->user_sp, &signalFrameData->context.uc_stack);
@@ -370,13 +372,12 @@ arch_restore_signal_frame(struct signal_frame_data* signalFrameData)
	frame->flags = (frame->flags & ~(uint64)X86_EFLAGS_USER_FLAGS)
		| (signalFrameData->context.uc_mcontext.rflags & X86_EFLAGS_USER_FLAGS);

	// Same as above, alignment may not be correct. Copy to thread and restore
	// from there.
	Thread* thread = thread_get_current_thread();

	memcpy(thread->arch_info.fpu_state,
		(void*)&signalFrameData->context.uc_mcontext.fpu,
		sizeof(thread->arch_info.fpu_state));
	x86_fxrstor(thread->arch_info.fpu_state);
	frame->fpu = &thread->arch_info.fpu_state;

	// The syscall return code overwrites frame->ax with the return value of
	// the syscall, need to return it here to ensure the correct value is

@@ -82,8 +82,10 @@ extern "C" void x86_reboot(void);
// from arch.S

void (*gCpuIdleFunc)(void);
#ifndef __x86_64__
void (*gX86SwapFPUFunc)(void* oldState, const void* newState) = x86_noop_swap;
bool gHasSSE = false;
#endif

static uint32 sCpuRendezvous;
static uint32 sCpuRendezvous2;
@@ -318,13 +320,14 @@ x86_init_fpu(void)
#endif

	dprintf("%s: CPU has SSE... enabling FXSR and XMM.\n", __func__);

#ifndef __x86_64__
	// enable OS support for SSE
	x86_write_cr4(x86_read_cr4() | CR4_OS_FXSR | CR4_OS_XMM_EXCEPTION);
	x86_write_cr0(x86_read_cr0() & ~(CR0_FPU_EMULATION | CR0_MONITOR_FPU));

	gX86SwapFPUFunc = x86_fxsave_swap;
	gHasSSE = true;
#endif
}

@@ -31,7 +31,9 @@
extern "C" void x86_return_to_userland(iframe* frame);

// from arch_cpu.cpp
#ifndef __x86_64__
extern void (*gX86SwapFPUFunc)(void *oldState, const void *newState);
#endif


static struct iframe*

@@ -245,7 +247,9 @@ arch_thread_context_switch(Thread* from, Thread* to)
		activePagingStructures->RemoveReference();
	}

#ifndef __x86_64__
	gX86SwapFPUFunc(from->arch_info.fpu_state, to->arch_info.fpu_state);
#endif
	x86_context_switch(&from->arch_info, &to->arch_info);
}

@@ -33,7 +33,9 @@
// TODO: Make those real error codes.


#ifndef __x86_64__
extern bool gHasSSE;
#endif

// The software breakpoint instruction (int3).
const uint8 kX86SoftwareBreakpoint[1] = { 0xcc };

@@ -688,6 +690,12 @@ arch_set_debug_cpu_state(const debug_cpu_state* cpuState)
	if (iframe* frame = x86_get_user_iframe()) {
		// For the floating point state to be correct the calling function must
		// not use these registers (not even indirectly).
#ifdef __x86_64__
		Thread* thread = thread_get_current_thread();
		memcpy(thread->arch_info.fpu_state, &cpuState->extended_registers,
			sizeof(cpuState->extended_registers));
		frame->fpu = &thread->arch_info.fpu_state;
#else
		if (gHasSSE) {
			// Since fxrstor requires 16-byte alignment and this isn't
			// guaranteed passed buffer, we use our thread's fpu_state field as

@@ -698,12 +706,11 @@ arch_set_debug_cpu_state(const debug_cpu_state* cpuState)
			memcpy(thread->arch_info.fpu_state, &cpuState->extended_registers,
				sizeof(cpuState->extended_registers));
			x86_fxrstor(thread->arch_info.fpu_state);
#ifndef __x86_64__
		} else {
			// TODO: Implement! We need to convert the format first.
			// x86_frstor(&cpuState->extended_registers);
#endif
		}
#endif
		set_iframe_registers(frame, cpuState);
	}
}

@@ -715,6 +722,15 @@ arch_get_debug_cpu_state(debug_cpu_state* cpuState)
	if (iframe* frame = x86_get_user_iframe()) {
		// For the floating point state to be correct the calling function must
		// not use these registers (not even indirectly).
#ifdef __x86_64__
		if (frame->fpu != nullptr) {
			memcpy(&cpuState->extended_registers, frame->fpu,
				sizeof(cpuState->extended_registers));
		} else {
			memset(&cpuState->extended_registers, 0,
				sizeof(cpuState->extended_registers));
		}
#else
		if (gHasSSE) {
			// Since fxsave requires 16-byte alignment and this isn't guaranteed
			// passed buffer, we use our thread's fpu_state field as temporary

@@ -725,15 +741,14 @@ arch_get_debug_cpu_state(debug_cpu_state* cpuState)
			// unlike fnsave, fxsave doesn't reinit the FPU state
			memcpy(&cpuState->extended_registers, thread->arch_info.fpu_state,
				sizeof(cpuState->extended_registers));
#ifndef __x86_64__
		} else {
			x86_fnsave(&cpuState->extended_registers);
			x86_frstor(&cpuState->extended_registers);
			// fnsave reinits the FPU state after saving, so we need to
			// load it again
			// TODO: Convert to fxsave format!
#endif
		}
#endif
		get_iframe_registers(frame, cpuState);
	}
}

@@ -70,6 +70,7 @@ dummy()
	DEFINE_OFFSET_MACRO(IFRAME, iframe, r8);
	DEFINE_OFFSET_MACRO(IFRAME, iframe, r9);
	DEFINE_OFFSET_MACRO(IFRAME, iframe, r10);
	DEFINE_OFFSET_MACRO(IFRAME, iframe, fpu);
#else
	DEFINE_OFFSET_MACRO(IFRAME, iframe, orig_eax);
#endif