kernel/x86_64: AVX support

Either xsave or xsavec is used, depending on what the CPU supports.
This breaks vregs compatibility.
Change the thread structure object cache alignment to 64 bytes.
The xsave fpu_state size isn't fixed; it is for instance 832 here, so I picked 1024.

Change-Id: I4a0cab0bc42c1d37f24dcafb8259f8ff24a330d2
Reviewed-on: https://review.haiku-os.org/c/haiku/+/2849
Reviewed-by: Adrien Destugues <pulkomandy@gmail.com>
Author: Jérôme Duval, 2020-05-05 23:03:39 +02:00 (committed by Adrien Destugues)
Commit: 9495126984, parent: 734c1e0491
10 changed files with 191 additions and 27 deletions

View File

@@ -89,6 +89,18 @@ struct fpu_state {
 	unsigned char	_reserved_416_511[96];
 };
 
+struct xstate_hdr {
+	unsigned long	bv;
+	unsigned long	xcomp_bv;
+	unsigned char	_reserved[48];
+};
+
+struct savefpu {
+	struct fpu_state	fp_fxsave;
+	struct xstate_hdr	fp_xstate;
+	unsigned long		fp_ymm[16][2];
+};
+
 struct vregs {
 	unsigned long	rax;
 	unsigned long	rbx;
@@ -110,7 +122,7 @@ struct vregs {
 	unsigned long	rip;
 	unsigned long	rflags;
-	struct fpu_state	fpu;
+	struct savefpu		fpu;
 };
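
The new savefpu layout also explains the "832" figure in the commit message: the legacy fxsave image is 512 bytes, the xstate header adds 64 bytes (8 + 8 + 48), and the upper halves of the 16 YMM registers add 16 * 16 = 256 bytes, for 832 bytes total; the kernel-side buffer later in the commit rounds this up to 1024. A minimal standalone sketch of that arithmetic, with fpu_state reduced to an opaque 512-byte block since only its tail appears in this hunk:

	#include <assert.h>
	#include <stddef.h>

	struct fpu_state { unsigned char _bytes[512]; };	/* legacy fxsave image */
	struct xstate_hdr {
		unsigned long	bv;
		unsigned long	xcomp_bv;
		unsigned char	_reserved[48];
	};
	struct savefpu {
		struct fpu_state	fp_fxsave;
		struct xstate_hdr	fp_xstate;
		unsigned long		fp_ymm[16][2];	/* upper 128 bits of YMM0-YMM15 */
	};

	static_assert(sizeof(struct xstate_hdr) == 64, "xstate header is 64 bytes");
	static_assert(offsetof(struct savefpu, fp_xstate) == 512, "header follows the fxsave area");
	static_assert(offsetof(struct savefpu, fp_ymm) == 576, "AVX state starts at offset 576");
	static_assert(sizeof(struct savefpu) == 832, "512 + 64 + 256 = 832 bytes");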

View File

@@ -22,6 +22,8 @@
 #define ALTCODEPATCH_TAG_STAC		1
 #define ALTCODEPATCH_TAG_CLAC		2
+#define ALTCODEPATCH_TAG_XSAVE		3
+#define ALTCODEPATCH_TAG_XRSTOR		4
 
 #ifdef _ASSEMBLER

View File

@@ -354,9 +354,15 @@
 #define IA32_CR4_GLOBAL_PAGES		(1UL << 7)
 #define CR4_OS_FXSR					(1UL << 9)
 #define CR4_OS_XMM_EXCEPTION		(1UL << 10)
+#define IA32_CR4_OSXSAVE			(1UL << 18)
 #define IA32_CR4_SMEP				(1UL << 20)
 #define IA32_CR4_SMAP				(1UL << 21)
 
+// Extended Control Register XCR0 flags
+#define IA32_XCR0_X87				(1UL << 0)
+#define IA32_XCR0_SSE				(1UL << 1)
+#define IA32_XCR0_AVX				(1UL << 2)
+
 // page fault error codes (http://wiki.osdev.org/Page_Fault)
 #define PGFAULT_P					0x01	// Protection violation
 #define PGFAULT_W					0x02	// Write
@@ -547,6 +553,16 @@ struct intel_microcode_extended_signature {
 #define clear_ac() \
 	__asm__ volatile (ASM_CLAC : : : "memory")
 
+#define xgetbv(reg) ({ \
+	uint32 low, high; \
+	__asm__ volatile ("xgetbv" : "=a" (low), "=d" (high) : "c" (reg)); \
+	(low | (uint64)high << 32); \
+})
+
+#define xsetbv(reg, value) { \
+	uint32 low = value; uint32 high = value >> 32; \
+	__asm__ volatile ("xsetbv" : : "a" (low), "d" (high), "c" (reg)); }
+
 #define out8(value,port) \
 	__asm__ ("outb %%al,%%dx" : : "a" (value), "d" (port))

View File

@@ -53,8 +53,13 @@ struct arch_thread {
 	struct farcall interrupt_stack;
 #endif
 
+#ifndef __x86_64__
 	// 512 byte floating point save point - this must be 16 byte aligned
 	uint8 fpu_state[512] _ALIGNED(16);
+#else
+	// floating point save point - this must be 64 byte aligned for xsave
+	uint8 fpu_state[1024] _ALIGNED(64);
+#endif
 
 	addr_t GetFramePointer() const;
 } _ALIGNED(16);
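
The 1024-byte size is a deliberate over-allocation (the commit message notes 832 bytes are needed on the author's machine, and the exact size is CPU-dependent), and the 64-byte alignment follows from xsave/xrstor requiring a 64-byte-aligned save area where fxsave only needed 16. Because arch_thread embeds this buffer, anything containing it must also be 64-byte aligned, which is what the _ALIGNED(64) on sInitialState and the object-cache alignment change later in this commit provide. A small compile-time sanity check of those assumptions, using a standalone stand-in struct:

	#include <assert.h>
	#include <stdalign.h>
	#include <stdint.h>

	/* stand-in for the kernel's arch_thread; only the field relevant here */
	struct arch_thread {
		alignas(64) uint8_t fpu_state[1024];
	};

	static_assert(alignof(struct arch_thread) >= 64,
		"xsave/xrstor need a 64-byte aligned save area");
	static_assert(sizeof(struct arch_thread) >= 832,
		"large enough for x87 + SSE + AVX state (832 bytes in the commit message example)");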

View File

@@ -119,3 +119,16 @@ FUNCTION_END(_stac)
 FUNCTION(_clac):
 	clac
 FUNCTION_END(_clac)
+
+FUNCTION(_xsave):
+	xsave64	(%rdi)
+FUNCTION_END(_xsave)
+
+FUNCTION(_xsavec):
+	xsavec64	(%rdi)
+FUNCTION_END(_xsavec)
+
+FUNCTION(_xrstor):
+	xrstor64	(%rdi)
+FUNCTION_END(_xrstor)
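
These stubs are never called; each exists only so its single instruction can be copied over the fxsaveq/fxrstorq instructions marked by CODEPATCH_START/CODEPATCH_END once the CPU's capabilities are known. All of the instructions involved take the save-area address in %rdi and encode to the same 4-byte length, which is why arch_altcodepatch_replace() is invoked with a length of 4 in the arch_cpu_init_post_vm() hunk further down. Conceptually the patching boils down to something like the sketch below; patch_site and apply_patches are hypothetical stand-ins, not Haiku's actual bookkeeping:

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	struct patch_site {
		void*		address;	/* instruction marked by CODEPATCH_START */
		uint16_t	tag;		/* e.g. ALTCODEPATCH_TAG_XSAVE */
		uint8_t		length;		/* bytes up to CODEPATCH_END */
	};

	static void
	apply_patches(struct patch_site* sites, size_t count, uint16_t tag,
		const void* newCode, size_t length)
	{
		for (size_t i = 0; i < count; i++) {
			if (sites[i].tag != tag || sites[i].length < length)
				continue;
			/* overwrite the default fxsaveq/fxrstorq with the replacement
			   instruction, e.g. 4 bytes copied from &_xsave or &_xsavec */
			memcpy(sites[i].address, newCode, length);
		}
	}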

View File

@@ -221,16 +221,32 @@ STATIC_FUNCTION(int_bottom):
 	// exception.
 	orq		$X86_EFLAGS_RESUME, IFRAME_flags(%rbp)
 
-	subq	$512, %rsp
-	andq	$~15, %rsp
-	fxsaveq	(%rsp)
+	// xsave needs a 64-byte alignment
+	andq	$~63, %rsp
+	movq	(gFPUSaveLength), %rcx
+	subq	%rcx, %rsp
+	leaq	(%rsp), %rdi
+	shrq	$3, %rcx
+	movq	$0, %rax
+	rep stosq
+
+	movl	(gXsaveMask), %eax
+	movl	(gXsaveMask+4), %edx
+	movq	%rsp, %rdi
+CODEPATCH_START
+	fxsaveq	(%rdi)
+CODEPATCH_END(ALTCODEPATCH_TAG_XSAVE)
 
 	// Call the interrupt handler.
 	movq	%rbp, %rdi
 	movq	IFRAME_vector(%rbp), %rax
 	call	*gInterruptHandlerTable(, %rax, 8)
 
-	fxrstorq	(%rsp)
+	movl	(gXsaveMask), %eax
+	movl	(gXsaveMask+4), %edx
+	movq	%rsp, %rdi
+CODEPATCH_START
+	fxrstorq	(%rdi)
+CODEPATCH_END(ALTCODEPATCH_TAG_XRSTOR)
 
 	movq	%rbp, %rsp
 
 	// Restore the saved registers.
@@ -253,9 +269,22 @@ STATIC_FUNCTION(int_bottom_user):
 	// Frame pointer is the iframe.
 	movq	%rsp, %rbp
 
-	subq	$512, %rsp
-	andq	$~15, %rsp
-	fxsaveq	(%rsp)
+	// xsave needs a 64-byte alignment
+	andq	$~63, %rsp
+	movq	(gFPUSaveLength), %rcx
+	subq	%rcx, %rsp
+	leaq	(%rsp), %rdi
+	shrq	$3, %rcx
+	movq	$0, %rax
+	rep stosq
+
+	movl	(gXsaveMask), %eax
+	movl	(gXsaveMask+4), %edx
+	movq	%rsp, %rdi
+CODEPATCH_START
+	fxsaveq	(%rdi)
+CODEPATCH_END(ALTCODEPATCH_TAG_XSAVE)
+
 	movq	%rsp, IFRAME_fpu(%rbp)
 
 	// Set the RF (resume flag) in RFLAGS. This prevents an instruction
@@ -286,7 +315,12 @@ STATIC_FUNCTION(int_bottom_user):
 	UPDATE_THREAD_KERNEL_TIME()
 
-	fxrstorq	(%rsp)
+	movl	(gXsaveMask), %eax
+	movl	(gXsaveMask+4), %edx
+	movq	%rsp, %rdi
+CODEPATCH_START
+	fxrstorq	(%rdi)
+CODEPATCH_END(ALTCODEPATCH_TAG_XRSTOR)
 
 	movq	%rbp, %rsp
 
 	// Restore the saved registers.
@@ -315,7 +349,12 @@ STATIC_FUNCTION(int_bottom_user):
 	movq	%rbp, %rdi
 	call	x86_init_user_debug_at_kernel_exit
 1:
-	fxrstorq	(%rsp)
+	movl	(gXsaveMask), %eax
+	movl	(gXsaveMask+4), %edx
+	movq	%rsp, %rdi
+CODEPATCH_START
+	fxrstorq	(%rdi)
+CODEPATCH_END(ALTCODEPATCH_TAG_XRSTOR)
 	movq	%rbp, %rsp
 
 	// Restore the saved registers.
@@ -522,8 +561,13 @@ FUNCTION(x86_64_syscall_entry):
 	jmp		.Liret
 
 .Lrestore_fpu:
-	movq	IFRAME_fpu(%rbp), %rax
-	fxrstorq	(%rax)
+	movq	IFRAME_fpu(%rbp), %rdi
+	movl	(gXsaveMask), %eax
+	movl	(gXsaveMask+4), %edx
+CODEPATCH_START
+	fxrstorq	(%rdi)
+CODEPATCH_END(ALTCODEPATCH_TAG_XRSTOR)
 
 .Liret:
 	// Restore the saved registers.
 	RESTORE_IFRAME()
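
Read as C, the reworked entry path replaces the old fixed sequence (reserve 512 bytes, align to 16, fxsaveq) with a CPU-sized, 64-byte-aligned, zero-initialized save area, loads the XSAVE state mask into %edx:%eax, and then runs whichever save instruction was patched in at boot. A rough pseudo-code rendering, where current_stack_pointer() and patched_fpu_save() are hypothetical stand-ins for the inline assembly:

	/* equivalent of the new int_bottom prologue, as pseudo-C */
	uint8* stack = current_stack_pointer();
	stack = (uint8*)((addr_t)stack & ~(addr_t)63);	// xsave needs 64-byte alignment
	stack -= gFPUSaveLength;						// CPU-dependent size (CPUID 0xd)
	memset(stack, 0, gFPUSaveLength);				// the "rep stosq" above

	// %edx:%eax selects which states to save/restore (x87 | SSE | AVX)
	uint32 maskLow = (uint32)gXsaveMask;
	uint32 maskHigh = (uint32)(gXsaveMask >> 32);
	patched_fpu_save(stack, maskLow, maskHigh);		// fxsaveq, xsave64 or xsavec64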

View File

@@ -68,7 +68,10 @@ class RestartSyscall : public AbstractTraceEntry {
 extern "C" void x86_64_thread_entry();
 
 // Initial thread saved state.
-static arch_thread sInitialState;
+static arch_thread sInitialState _ALIGNED(64);
+extern uint64 gFPUSaveLength;
+extern bool gHasXsave;
+extern bool gHasXsavec;
 
 void
@@ -140,12 +143,36 @@ arch_thread_init(kernel_args* args)
 {
 	// Save one global valid FPU state; it will be copied in the arch dependent
 	// part of each new thread.
-	asm volatile (
-		"clts;"		\
-		"fninit;"	\
-		"fnclex;"	\
-		"fxsave %0;"
-		: "=m" (sInitialState.fpu_state));
+	if (gHasXsave || gHasXsavec) {
+		ASSERT(gFPUSaveLength <= sizeof(sInitialState.fpu_state));
+		memset(sInitialState.fpu_state, 0, gFPUSaveLength);
+		if (gHasXsavec) {
+			asm volatile (
+				"clts;"		\
+				"fninit;"	\
+				"fnclex;"	\
+				"movl $0x7,%%eax;"	\
+				"movl $0x0,%%edx;"	\
+				"xsavec64 %0"
+				:: "m" (sInitialState.fpu_state));
+		} else {
+			asm volatile (
+				"clts;"		\
+				"fninit;"	\
+				"fnclex;"	\
+				"movl $0x7,%%eax;"	\
+				"movl $0x0,%%edx;"	\
+				"xsave64 %0"
+				:: "m" (sInitialState.fpu_state));
+		}
+	} else {
+		asm volatile (
+			"clts;"		\
+			"fninit;"	\
+			"fnclex;"	\
+			"fxsaveq %0"
+			:: "m" (sInitialState.fpu_state));
+	}
 
 	return B_OK;
 }
@@ -309,11 +336,10 @@ arch_setup_signal_frame(Thread* thread, struct sigaction* action,
 	if (frame->fpu != nullptr) {
 		memcpy((void*)&signalFrameData->context.uc_mcontext.fpu, frame->fpu,
-			sizeof(signalFrameData->context.uc_mcontext.fpu));
+			gFPUSaveLength);
 	} else {
 		memcpy((void*)&signalFrameData->context.uc_mcontext.fpu,
-			sInitialState.fpu_state,
-			sizeof(signalFrameData->context.uc_mcontext.fpu));
+			sInitialState.fpu_state, gFPUSaveLength);
 	}
 
 	// Fill in signalFrameData->context.uc_stack.
@@ -385,8 +411,7 @@ arch_restore_signal_frame(struct signal_frame_data* signalFrameData)
 	Thread* thread = thread_get_current_thread();
 
 	memcpy(thread->arch_info.fpu_state,
-		(void*)&signalFrameData->context.uc_mcontext.fpu,
-		sizeof(thread->arch_info.fpu_state));
+		(void*)&signalFrameData->context.uc_mcontext.fpu, gFPUSaveLength);
 	frame->fpu = &thread->arch_info.fpu_state;
 
 	// The syscall return code overwrites frame->ax with the return value of

View File

@@ -52,6 +52,7 @@ arch_altcodepatch_replace(uint16 tag, void* newcodepatch, size_t length)
 	// disable write after patch
 	set_area_protection(info->text_region.id, kernelProtection);
 
-	dprintf("arch_altcodepatch_replace found %" B_PRIu32 " altcodepatches\n", count);
+	dprintf("arch_altcodepatch_replace found %" B_PRIu32 " altcodepatches "
+		"for tag %u\n", count, tag);
 }

View File

@@ -84,6 +84,13 @@ struct set_mtrrs_parameter {
 #ifdef __x86_64__
 extern addr_t _stac;
 extern addr_t _clac;
+extern addr_t _xsave;
+extern addr_t _xsavec;
+extern addr_t _xrstor;
+uint64 gXsaveMask;
+uint64 gFPUSaveLength = 512;
+bool gHasXsave = false;
+bool gHasXsavec = false;
 #endif
 
 extern "C" void x86_reboot(void);
@@ -1406,6 +1413,20 @@ enable_smep(void* dummy, int cpu)
 {
 	x86_write_cr4(x86_read_cr4() | IA32_CR4_SMEP);
 }
+
+
+static void
+enable_osxsave(void* dummy, int cpu)
+{
+	x86_write_cr4(x86_read_cr4() | IA32_CR4_OSXSAVE);
+}
+
+
+static void
+enable_xsavemask(void* dummy, int cpu)
+{
+	xsetbv(0, gXsaveMask);
+}
 #endif
@@ -1459,6 +1480,31 @@ arch_cpu_init_post_vm(kernel_args* args)
 		} else
 			dprintf("SMAP disabled per safemode setting\n");
 	}
+
+	// if available enable XSAVE (XSAVE and extended states)
+	gHasXsave = x86_check_feature(IA32_FEATURE_EXT_XSAVE, FEATURE_EXT);
+	if (gHasXsave) {
+		gHasXsavec = x86_check_feature(IA32_FEATURE_XSAVEC,
+			FEATURE_D_1_EAX);
+
+		call_all_cpus_sync(&enable_osxsave, NULL);
+		gXsaveMask = IA32_XCR0_X87 | IA32_XCR0_SSE;
+		cpuid_info cpuid;
+		get_current_cpuid(&cpuid, 0xd, 0);
+		gXsaveMask |= (cpuid.regs.eax & IA32_XCR0_AVX);
+		call_all_cpus_sync(&enable_xsavemask, NULL);
+		get_current_cpuid(&cpuid, 0xd, 0);
+		gFPUSaveLength = cpuid.regs.ebx;
+
+		arch_altcodepatch_replace(ALTCODEPATCH_TAG_XSAVE,
+			gHasXsavec ? &_xsavec : &_xsave, 4);
+		arch_altcodepatch_replace(ALTCODEPATCH_TAG_XRSTOR,
+			&_xrstor, 4);
+
+		dprintf("enable %s 0x%" B_PRIx64 " %" B_PRId64 "\n",
+			gHasXsavec ? "XSAVEC" : "XSAVE", gXsaveMask, gFPUSaveLength);
+	}
 #endif
 
 	return B_OK;
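
The sizing here comes from CPUID leaf 0xd, sub-leaf 0: EAX (and EDX) report which XCR0 bits the processor supports, which is where the AVX bit is picked up, while EBX reports the save-area size in bytes for the states currently enabled in XCR0, which is why get_current_cpuid() is called a second time after xsetbv() has run on every CPU. A user-space sketch of the same query, using the GCC/Clang <cpuid.h> helper instead of the kernel's get_current_cpuid():

	#include <cpuid.h>
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint32_t eax, ebx, ecx, edx;
		if (!__get_cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx))
			return 1;

		/* eax/edx: XCR0 bits supported by the CPU (bit 2 = AVX)
		   ebx: bytes XSAVE needs for the states currently enabled in XCR0
		   ecx: bytes needed if every supported state were enabled */
		printf("supported XCR0 bits: %#x, current save size: %u, max size: %u\n",
			eax, ebx, ecx);
		return 0;
	}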

View File

@@ -2679,9 +2679,9 @@ thread_init(kernel_args *args)
 		panic("thread_init(): failed to init thread hash table!");
 
 	// create the thread structure object cache
-	sThreadCache = create_object_cache("threads", sizeof(Thread), 16, NULL,
+	sThreadCache = create_object_cache("threads", sizeof(Thread), 64, NULL,
 		NULL, NULL);
-		// Note: The x86 port requires 16 byte alignment of thread structures.
+		// Note: The x86 port requires 64 byte alignment of thread structures.
 	if (sThreadCache == NULL)
 		panic("thread_init(): failed to allocate thread object cache!");