diff --git a/headers/private/kernel/arch/x86/64/cpu.h b/headers/private/kernel/arch/x86/64/cpu.h index a29891d5de..02860371b8 100644 --- a/headers/private/kernel/arch/x86/64/cpu.h +++ b/headers/private/kernel/arch/x86/64/cpu.h @@ -28,6 +28,10 @@ x86_write_msr(uint32_t msr, uint64_t value) static inline void x86_context_switch(arch_thread* oldState, arch_thread* newState) { + uint16_t fpuControl; + asm volatile("fnstcw %0" : "=m" (fpuControl)); + uint32_t sseControl; + asm volatile("stmxcsr %0" : "=m" (sseControl)); asm volatile( "pushq %%rbp;" "movq $1f, %c[rip](%0);" @@ -41,7 +45,11 @@ x86_context_switch(arch_thread* oldState, arch_thread* newState) [rsp] "i" (offsetof(arch_thread, current_stack)), [rip] "i" (offsetof(arch_thread, instruction_pointer)) : "rbx", "rcx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", - "r14", "r15", "memory"); + "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", + "xmm14", "xmm15", "memory"); + asm volatile("ldmxcsr %0" : : "m" (sseControl)); + asm volatile("fldcw %0" : : "m" (fpuControl)); } diff --git a/headers/private/kernel/arch/x86/64/iframe.h b/headers/private/kernel/arch/x86/64/iframe.h index 2320949c5b..d4b61829db 100644 --- a/headers/private/kernel/arch/x86/64/iframe.h +++ b/headers/private/kernel/arch/x86/64/iframe.h @@ -8,6 +8,7 @@ struct iframe { uint64 type; + void* fpu; uint64 r15; uint64 r14; uint64 r13; diff --git a/headers/private/kernel/arch/x86/arch_cpu.h b/headers/private/kernel/arch/x86/arch_cpu.h index c6d4c11c5e..a202d08f4a 100644 --- a/headers/private/kernel/arch/x86/arch_cpu.h +++ b/headers/private/kernel/arch/x86/arch_cpu.h @@ -454,10 +454,7 @@ void __x86_setup_system_time(uint32 conversionFactor, void x86_userspace_thread_exit(void); void x86_end_userspace_thread_exit(void); -void x86_fxsave(void* fpuState); -void x86_fxrstor(const void* fpuState); -void x86_noop_swap(void* oldFpuState, const void* newFpuState); -void 
x86_fxsave_swap(void* oldFpuState, const void* newFpuState); + addr_t x86_get_stack_frame(); uint32 x86_count_mtrrs(void); void x86_set_mtrr(uint32 index, uint64 base, uint64 length, uint8 type); @@ -488,7 +485,13 @@ void x86_context_switch(struct arch_thread* oldState, void x86_fnsave(void* fpuState); void x86_frstor(const void* fpuState); + +void x86_fxsave(void* fpuState); +void x86_fxrstor(const void* fpuState); + +void x86_noop_swap(void* oldFpuState, const void* newFpuState); void x86_fnsave_swap(void* oldFpuState, const void* newFpuState); +void x86_fxsave_swap(void* oldFpuState, const void* newFpuState); #endif diff --git a/src/system/kernel/arch/x86/64/arch.S b/src/system/kernel/arch/x86/64/arch.S index 4cdd22af84..19c74b4d2d 100644 --- a/src/system/kernel/arch/x86/64/arch.S +++ b/src/system/kernel/arch/x86/64/arch.S @@ -19,35 +19,6 @@ .text -/* void x86_fxsave(void* fpuState); */ -FUNCTION(x86_fxsave): - fxsave (%rdi) - ret -FUNCTION_END(x86_fxsave) - - -/* void x86_fxrstor(const void* fpuState); */ -FUNCTION(x86_fxrstor): - fxrstor (%rdi) - ret -FUNCTION_END(x86_fxrstor) - - -/* void x86_noop_swap(void *oldFpuState, const void *newFpuState); */ -FUNCTION(x86_noop_swap): - nop - ret -FUNCTION_END(x86_noop_swap) - - -/* void x86_fxsave_swap(void* oldFpuState, const void* newFpuState); */ -FUNCTION(x86_fxsave_swap): - fxsave (%rdi) - fxrstor (%rsi) - ret -FUNCTION_END(x86_fxsave_swap) - - /* addr_t x86_get_stack_frame(); */ FUNCTION(x86_get_stack_frame): mov %rbp, %rax diff --git a/src/system/kernel/arch/x86/64/interrupts.S b/src/system/kernel/arch/x86/64/interrupts.S index a63cd009b8..50ba3ea384 100644 --- a/src/system/kernel/arch/x86/64/interrupts.S +++ b/src/system/kernel/arch/x86/64/interrupts.S @@ -35,11 +35,13 @@ push %r13; \ push %r14; \ push %r15; \ + pushq $0; \ push $iframeType; + // Restore the interrupt frame. 
#define RESTORE_IFRAME() \ - add $8, %rsp; \ + add $16, %rsp; \ pop %r15; \ pop %r14; \ pop %r13; \ @@ -198,11 +200,18 @@ STATIC_FUNCTION(int_bottom): // exception. orq $X86_EFLAGS_RESUME, IFRAME_flags(%rbp) + subq $512, %rsp + andq $~15, %rsp + fxsaveq (%rsp) + // Call the interrupt handler. - movq %rsp, %rdi - movq IFRAME_vector(%rsp), %rax + movq %rbp, %rdi + movq IFRAME_vector(%rbp), %rax call *gInterruptHandlerTable(, %rax, 8) + fxrstorq (%rsp) + movq %rbp, %rsp + // Restore the saved registers. RESTORE_IFRAME() @@ -217,12 +226,16 @@ STATIC_FUNCTION(int_bottom_user): // Push the rest of the interrupt frame to the stack. PUSH_IFRAME_BOTTOM(IFRAME_TYPE_OTHER) - cld // Frame pointer is the iframe. movq %rsp, %rbp + subq $512, %rsp + andq $~15, %rsp + fxsaveq (%rsp) + movq %rsp, IFRAME_fpu(%rbp) + // Set the RF (resume flag) in RFLAGS. This prevents an instruction // breakpoint on the instruction we're returning to to trigger a debug // exception. @@ -235,8 +248,8 @@ STATIC_FUNCTION(int_bottom_user): UPDATE_THREAD_USER_TIME() // Call the interrupt handler. - movq %rsp, %rdi - movq IFRAME_vector(%rsp), %rax + movq %rbp, %rdi + movq IFRAME_vector(%rbp), %rax call *gInterruptHandlerTable(, %rax, 8) // If there are no signals pending or we're not debugging, we can avoid @@ -250,6 +263,9 @@ STATIC_FUNCTION(int_bottom_user): UPDATE_THREAD_KERNEL_TIME() + fxrstorq (%rsp) + movq %rbp, %rsp + // Restore the saved registers. RESTORE_IFRAME() @@ -274,6 +290,9 @@ STATIC_FUNCTION(int_bottom_user): movq %rbp, %rdi call x86_init_user_debug_at_kernel_exit 1: + fxrstorq (%rsp) + movq %rbp, %rsp + // Restore the saved registers. RESTORE_IFRAME() @@ -395,7 +414,7 @@ FUNCTION(x86_64_syscall_entry): // If we've just restored a signal frame, use the IRET path. cmpq $SYSCALL_RESTORE_SIGNAL_FRAME, %r14 - je .Liret + je .Lrestore_fpu // Restore the iframe and RCX/R11 for SYSRET. 
RESTORE_IFRAME() @@ -466,7 +485,11 @@ FUNCTION(x86_64_syscall_entry): // On this return path it is possible that the frame has been modified, // for example to execute a signal handler. In this case it is safer to // return via IRET. + jmp .Liret +.Lrestore_fpu: + movq IFRAME_fpu(%rbp), %rax + fxrstorq (%rax) .Liret: // Restore the saved registers. RESTORE_IFRAME() @@ -537,7 +560,7 @@ FUNCTION(x86_return_to_userland): testl $(THREAD_FLAGS_DEBUGGER_INSTALLED | THREAD_FLAGS_SIGNALS_PENDING \ | THREAD_FLAGS_DEBUG_THREAD | THREAD_FLAGS_BREAKPOINTS_DEFINED) \ , THREAD_flags(%r12) - jnz .Lkernel_exit_work + jnz .Luserland_return_work // update the thread's kernel time and return UPDATE_THREAD_KERNEL_TIME() @@ -546,4 +569,33 @@ FUNCTION(x86_return_to_userland): RESTORE_IFRAME() swapgs iretq +.Luserland_return_work: + // Slow path for return to userland. + + // Do we need to handle signals? + testl $(THREAD_FLAGS_SIGNALS_PENDING | THREAD_FLAGS_DEBUG_THREAD) \ + , THREAD_flags(%r12) + jnz .Luserland_return_handle_signals + cli + call thread_at_kernel_exit_no_signals + +.Luserland_return_work_done: + // Install breakpoints, if defined. + testl $THREAD_FLAGS_BREAKPOINTS_DEFINED, THREAD_flags(%r12) + jz 1f + movq %rbp, %rdi + call x86_init_user_debug_at_kernel_exit +1: + // Restore the saved registers. + RESTORE_IFRAME() + + // Restore the previous GS base and return. + swapgs + iretq +.Luserland_return_handle_signals: + // thread_at_kernel_exit requires interrupts to be enabled; it will disable + // them afterwards. 
+ sti + call thread_at_kernel_exit + jmp .Luserland_return_work_done FUNCTION_END(x86_return_to_userland) diff --git a/src/system/kernel/arch/x86/64/thread.cpp b/src/system/kernel/arch/x86/64/thread.cpp index 961a2534c1..9e535fefbe 100644 --- a/src/system/kernel/arch/x86/64/thread.cpp +++ b/src/system/kernel/arch/x86/64/thread.cpp @@ -134,9 +134,12 @@ arch_thread_init(kernel_args* args) { // Save one global valid FPU state; it will be copied in the arch dependent // part of each new thread. - asm volatile ("clts; fninit; fnclex;"); - x86_fxsave(sInitialState.fpu_state); - + asm volatile ( + "clts;" \ + "fninit;" \ + "fnclex;" \ + "fxsave %0;" + : "=m" (sInitialState.fpu_state)); return B_OK; } @@ -296,15 +299,14 @@ arch_setup_signal_frame(Thread* thread, struct sigaction* action, signalFrameData->context.uc_mcontext.rip = frame->ip; signalFrameData->context.uc_mcontext.rflags = frame->flags; - // Store the FPU state. There appears to be a bug in GCC where the aligned - // attribute on a structure is being ignored when the structure is allocated - // on the stack, so even if the fpu_state struct has aligned(16) it may not - // get aligned correctly. Instead, use the current thread's FPU save area - // and then memcpy() to the frame structure. - x86_fxsave(thread->arch_info.fpu_state); - memcpy((void*)&signalFrameData->context.uc_mcontext.fpu, - thread->arch_info.fpu_state, - sizeof(signalFrameData->context.uc_mcontext.fpu)); + if (frame->fpu != nullptr) { + memcpy((void*)&signalFrameData->context.uc_mcontext.fpu, frame->fpu, + sizeof(signalFrameData->context.uc_mcontext.fpu)); + } else { + memcpy((void*)&signalFrameData->context.uc_mcontext.fpu, + sInitialState.fpu_state, + sizeof(signalFrameData->context.uc_mcontext.fpu)); + } // Fill in signalFrameData->context.uc_stack. 
signal_get_user_stack(frame->user_sp, &signalFrameData->context.uc_stack); @@ -370,13 +372,12 @@ arch_restore_signal_frame(struct signal_frame_data* signalFrameData) frame->flags = (frame->flags & ~(uint64)X86_EFLAGS_USER_FLAGS) | (signalFrameData->context.uc_mcontext.rflags & X86_EFLAGS_USER_FLAGS); - // Same as above, alignment may not be correct. Copy to thread and restore - // from there. Thread* thread = thread_get_current_thread(); + memcpy(thread->arch_info.fpu_state, (void*)&signalFrameData->context.uc_mcontext.fpu, sizeof(thread->arch_info.fpu_state)); - x86_fxrstor(thread->arch_info.fpu_state); + frame->fpu = &thread->arch_info.fpu_state; // The syscall return code overwrites frame->ax with the return value of // the syscall, need to return it here to ensure the correct value is diff --git a/src/system/kernel/arch/x86/arch_cpu.cpp b/src/system/kernel/arch/x86/arch_cpu.cpp index 6cd2628ab5..65ea7fce1c 100644 --- a/src/system/kernel/arch/x86/arch_cpu.cpp +++ b/src/system/kernel/arch/x86/arch_cpu.cpp @@ -82,8 +82,10 @@ extern "C" void x86_reboot(void); // from arch.S void (*gCpuIdleFunc)(void); +#ifndef __x86_64__ void (*gX86SwapFPUFunc)(void* oldState, const void* newState) = x86_noop_swap; bool gHasSSE = false; +#endif static uint32 sCpuRendezvous; static uint32 sCpuRendezvous2; @@ -318,13 +320,14 @@ x86_init_fpu(void) #endif dprintf("%s: CPU has SSE... 
enabling FXSR and XMM.\n", __func__); - +#ifndef __x86_64__ // enable OS support for SSE x86_write_cr4(x86_read_cr4() | CR4_OS_FXSR | CR4_OS_XMM_EXCEPTION); x86_write_cr0(x86_read_cr0() & ~(CR0_FPU_EMULATION | CR0_MONITOR_FPU)); gX86SwapFPUFunc = x86_fxsave_swap; gHasSSE = true; +#endif } diff --git a/src/system/kernel/arch/x86/arch_thread.cpp b/src/system/kernel/arch/x86/arch_thread.cpp index 25cde443ed..931c74b099 100644 --- a/src/system/kernel/arch/x86/arch_thread.cpp +++ b/src/system/kernel/arch/x86/arch_thread.cpp @@ -31,7 +31,9 @@ extern "C" void x86_return_to_userland(iframe* frame); // from arch_cpu.cpp +#ifndef __x86_64__ extern void (*gX86SwapFPUFunc)(void *oldState, const void *newState); +#endif static struct iframe* @@ -245,7 +247,9 @@ arch_thread_context_switch(Thread* from, Thread* to) activePagingStructures->RemoveReference(); } +#ifndef __x86_64__ gX86SwapFPUFunc(from->arch_info.fpu_state, to->arch_info.fpu_state); +#endif x86_context_switch(&from->arch_info, &to->arch_info); } diff --git a/src/system/kernel/arch/x86/arch_user_debugger.cpp b/src/system/kernel/arch/x86/arch_user_debugger.cpp index 516ed8f340..8b621d9e3c 100644 --- a/src/system/kernel/arch/x86/arch_user_debugger.cpp +++ b/src/system/kernel/arch/x86/arch_user_debugger.cpp @@ -33,7 +33,9 @@ // TODO: Make those real error codes. +#ifndef __x86_64__ extern bool gHasSSE; +#endif // The software breakpoint instruction (int3). const uint8 kX86SoftwareBreakpoint[1] = { 0xcc }; @@ -688,6 +690,12 @@ arch_set_debug_cpu_state(const debug_cpu_state* cpuState) if (iframe* frame = x86_get_user_iframe()) { // For the floating point state to be correct the calling function must // not use these registers (not even indirectly). 
+#ifdef __x86_64__ + Thread* thread = thread_get_current_thread(); + memcpy(thread->arch_info.fpu_state, &cpuState->extended_registers, + sizeof(cpuState->extended_registers)); + frame->fpu = &thread->arch_info.fpu_state; +#else if (gHasSSE) { // Since fxrstor requires 16-byte alignment and this isn't // guaranteed passed buffer, we use our thread's fpu_state field as @@ -698,12 +706,11 @@ arch_set_debug_cpu_state(const debug_cpu_state* cpuState) memcpy(thread->arch_info.fpu_state, &cpuState->extended_registers, sizeof(cpuState->extended_registers)); x86_fxrstor(thread->arch_info.fpu_state); -#ifndef __x86_64__ } else { // TODO: Implement! We need to convert the format first. // x86_frstor(&cpuState->extended_registers); -#endif } +#endif set_iframe_registers(frame, cpuState); } } @@ -715,6 +722,15 @@ arch_get_debug_cpu_state(debug_cpu_state* cpuState) if (iframe* frame = x86_get_user_iframe()) { // For the floating point state to be correct the calling function must // not use these registers (not even indirectly). +#ifdef __x86_64__ + if (frame->fpu != nullptr) { + memcpy(&cpuState->extended_registers, frame->fpu, + sizeof(cpuState->extended_registers)); + } else { + memset(&cpuState->extended_registers, 0, + sizeof(cpuState->extended_registers)); + } +#else if (gHasSSE) { // Since fxsave requires 16-byte alignment and this isn't guaranteed // passed buffer, we use our thread's fpu_state field as temporary @@ -725,15 +741,14 @@ arch_get_debug_cpu_state(debug_cpu_state* cpuState) // unlike fnsave, fxsave doesn't reinit the FPU state memcpy(&cpuState->extended_registers, thread->arch_info.fpu_state, sizeof(cpuState->extended_registers)); -#ifndef __x86_64__ } else { x86_fnsave(&cpuState->extended_registers); x86_frstor(&cpuState->extended_registers); // fnsave reinits the FPU state after saving, so we need to // load it again // TODO: Convert to fxsave format! 
-#endif } +#endif get_iframe_registers(frame, cpuState); } } diff --git a/src/system/kernel/arch/x86/asm_offsets.cpp b/src/system/kernel/arch/x86/asm_offsets.cpp index db89298604..d89200bf87 100644 --- a/src/system/kernel/arch/x86/asm_offsets.cpp +++ b/src/system/kernel/arch/x86/asm_offsets.cpp @@ -70,6 +70,7 @@ dummy() DEFINE_OFFSET_MACRO(IFRAME, iframe, r8); DEFINE_OFFSET_MACRO(IFRAME, iframe, r9); DEFINE_OFFSET_MACRO(IFRAME, iframe, r10); + DEFINE_OFFSET_MACRO(IFRAME, iframe, fpu); #else DEFINE_OFFSET_MACRO(IFRAME, iframe, orig_eax); #endif