From 6156a508adb812153113f01aa1e547fff1e41bdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Dziepak?= Date: Sat, 6 Sep 2014 16:25:00 +0200 Subject: [PATCH 1/8] kernel/x86[_64]: remove get_optimized_functions from cpu modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The possibility to specify custom memcpy and memset implementations in cpu modules is currently unused and there is generally no point in such a feature. There are only 2 x86 vendors that really matter and there isn't a very big difference in performance of the generic optimized versions of these functions across different models. Even if we wanted different versions of memset and memcpy depending on the processor model or features, a much better solution would be to use STT_GNU_IFUNC and save one indirect call. Long story short, we don't really benefit in any way from get_optimized_functions and the feature it implements and it only adds unnecessary complexity to the code. Signed-off-by: Paweł Dziepak --- headers/private/kernel/arch/x86/arch_cpu.h | 9 ---- src/system/kernel/arch/x86/arch_cpu.cpp | 41 +++---------------- src/system/kernel/arch/x86/asm_offsets.cpp | 6 --- src/system/kernel/lib/arch/x86/arch_string.S | 33 +++------------ .../kernel/lib/arch/x86_64/arch_string.S | 21 +++------- 5 files changed, 16 insertions(+), 94 deletions(-) diff --git a/headers/private/kernel/arch/x86/arch_cpu.h b/headers/private/kernel/arch/x86/arch_cpu.h index 88c5eb8e6a..800f489300 100644 --- a/headers/private/kernel/arch/x86/arch_cpu.h +++ b/headers/private/kernel/arch/x86/arch_cpu.h @@ -262,13 +262,6 @@ typedef struct x86_mtrr_info { uint8 type; } x86_mtrr_info; -typedef struct x86_optimized_functions { - void (*memcpy)(void* dest, const void* source, size_t count); - void* memcpy_end; - void (*memset)(void* dest, int value, size_t count); - void* memset_end; -} x86_optimized_functions; - typedef struct x86_cpu_module_info { module_info info; uint32 
(*count_mtrrs)(void); @@ -280,8 +273,6 @@ typedef struct x86_cpu_module_info { uint8* _type); void (*set_mtrrs)(uint8 defaultType, const x86_mtrr_info* infos, uint32 count); - - void (*get_optimized_functions)(x86_optimized_functions* functions); } x86_cpu_module_info; // features diff --git a/src/system/kernel/arch/x86/arch_cpu.cpp b/src/system/kernel/arch/x86/arch_cpu.cpp index 93d58f81e4..9fd51ae6d6 100644 --- a/src/system/kernel/arch/x86/arch_cpu.cpp +++ b/src/system/kernel/arch/x86/arch_cpu.cpp @@ -105,17 +105,8 @@ static const size_t kDoubleFaultStackSize = 4096; // size per CPU static x86_cpu_module_info* sCpuModule; -extern "C" void memcpy_generic(void* dest, const void* source, size_t count); -extern int memcpy_generic_end; -extern "C" void memset_generic(void* dest, int value, size_t count); -extern int memset_generic_end; - -x86_optimized_functions gOptimizedFunctions = { - memcpy_generic, - &memcpy_generic_end, - memset_generic, - &memset_generic_end -}; +extern int memcpy_end; +extern int memset_end; /* CPU topology information */ static uint32 (*sGetCPUTopologyID)(int currentCPU); @@ -1163,33 +1154,13 @@ arch_cpu_init_post_modules(kernel_args* args) call_all_cpus(&init_mtrrs, NULL); } - // get optimized functions from the CPU module - if (sCpuModule != NULL && sCpuModule->get_optimized_functions != NULL) { - x86_optimized_functions functions; - memset(&functions, 0, sizeof(functions)); - - sCpuModule->get_optimized_functions(&functions); - - if (functions.memcpy != NULL) { - gOptimizedFunctions.memcpy = functions.memcpy; - gOptimizedFunctions.memcpy_end = functions.memcpy_end; - } - - if (functions.memset != NULL) { - gOptimizedFunctions.memset = functions.memset; - gOptimizedFunctions.memset_end = functions.memset_end; - } - } - // put the optimized functions into the commpage - size_t memcpyLen = (addr_t)gOptimizedFunctions.memcpy_end - - (addr_t)gOptimizedFunctions.memcpy; + size_t memcpyLen = (addr_t)&memcpy_end - (addr_t)memcpy; addr_t 
memcpyPosition = fill_commpage_entry(COMMPAGE_ENTRY_X86_MEMCPY, - (const void*)gOptimizedFunctions.memcpy, memcpyLen); - size_t memsetLen = (addr_t)gOptimizedFunctions.memset_end - - (addr_t)gOptimizedFunctions.memset; + (const void*)memcpy, memcpyLen); + size_t memsetLen = (addr_t)&memset_end - (addr_t)memset; addr_t memsetPosition = fill_commpage_entry(COMMPAGE_ENTRY_X86_MEMSET, - (const void*)gOptimizedFunctions.memset, memsetLen); + (const void*)memset, memsetLen); size_t threadExitLen = (addr_t)x86_end_userspace_thread_exit - (addr_t)x86_userspace_thread_exit; addr_t threadExitPosition = fill_commpage_entry( diff --git a/src/system/kernel/arch/x86/asm_offsets.cpp b/src/system/kernel/arch/x86/asm_offsets.cpp index 787fef10e9..db89298604 100644 --- a/src/system/kernel/arch/x86/asm_offsets.cpp +++ b/src/system/kernel/arch/x86/asm_offsets.cpp @@ -79,12 +79,6 @@ dummy() DEFINE_OFFSET_MACRO(SYSCALL_INFO, syscall_info, function); DEFINE_OFFSET_MACRO(SYSCALL_INFO, syscall_info, parameter_size); - // struct x86_optimized_functions - DEFINE_OFFSET_MACRO(X86_OPTIMIZED_FUNCTIONS, x86_optimized_functions, - memcpy); - DEFINE_OFFSET_MACRO(X86_OPTIMIZED_FUNCTIONS, x86_optimized_functions, - memset); - // struct signal_frame_data DEFINE_SIZEOF_MACRO(SIGNAL_FRAME_DATA, signal_frame_data); DEFINE_OFFSET_MACRO(SIGNAL_FRAME_DATA, signal_frame_data, info); diff --git a/src/system/kernel/lib/arch/x86/arch_string.S b/src/system/kernel/lib/arch/x86/arch_string.S index 62630627b2..7a3480b2c5 100644 --- a/src/system/kernel/lib/arch/x86/arch_string.S +++ b/src/system/kernel/lib/arch/x86/arch_string.S @@ -6,22 +6,12 @@ * Distributed under the terms of the NewOS License. */ -#if !_BOOT_MODE -# include "asm_offsets.h" -#endif #include -// We don't need the indirection in the boot loader. 
-#if _BOOT_MODE -# define memcpy_generic memcpy -# define memset_generic memset -#endif - - .align 4 -FUNCTION(memcpy_generic): +FUNCTION(memcpy): pushl %esi pushl %edi movl 12(%esp),%edi /* dest */ @@ -45,13 +35,13 @@ FUNCTION(memcpy_generic): popl %edi popl %esi ret -FUNCTION_END(memcpy_generic) -SYMBOL(memcpy_generic_end): +FUNCTION_END(memcpy) +SYMBOL(memcpy_end): /* void *memset(void *dest, int value, size_t length); */ .align 4 -FUNCTION(memset_generic): +FUNCTION(memset): push %ebp mov %esp, %ebp @@ -111,19 +101,6 @@ FUNCTION(memset_generic): mov %ebp, %esp pop %ebp ret -FUNCTION_END(memset_generic) -SYMBOL(memset_generic_end): - - -#if !_BOOT_MODE - -.align 4 -FUNCTION(memcpy): - jmp *(gOptimizedFunctions + X86_OPTIMIZED_FUNCTIONS_memcpy) -FUNCTION_END(memcpy) - -FUNCTION(memset): - jmp *(gOptimizedFunctions + X86_OPTIMIZED_FUNCTIONS_memset) FUNCTION_END(memset) +SYMBOL(memset_end): -#endif // !_BOOT_MODE diff --git a/src/system/kernel/lib/arch/x86_64/arch_string.S b/src/system/kernel/lib/arch/x86_64/arch_string.S index a24bbc8ff6..f0849e8fd8 100644 --- a/src/system/kernel/lib/arch/x86_64/arch_string.S +++ b/src/system/kernel/lib/arch/x86_64/arch_string.S @@ -6,11 +6,9 @@ #include -#include "asm_offsets.h" - .align 8 -FUNCTION(memcpy_generic): +FUNCTION(memcpy): push %rbp movq %rsp, %rbp @@ -59,12 +57,12 @@ FUNCTION(memcpy_generic): pop %rbp ret -FUNCTION_END(memcpy_generic) -SYMBOL(memcpy_generic_end): +FUNCTION_END(memcpy) +SYMBOL(memcpy_end): .align 8 -FUNCTION(memset_generic): +FUNCTION(memset): push %rbp movq %rsp, %rbp @@ -82,15 +80,6 @@ FUNCTION(memset_generic): movq %r8, %rax pop %rbp ret -FUNCTION_END(memset_generic) -SYMBOL(memset_generic_end): - - -FUNCTION(memcpy): - jmp *(gOptimizedFunctions + X86_OPTIMIZED_FUNCTIONS_memcpy) -FUNCTION_END(memcpy) - -FUNCTION(memset): - jmp *(gOptimizedFunctions + X86_OPTIMIZED_FUNCTIONS_memset) FUNCTION_END(memset) +SYMBOL(memset_end): From f2f91078bdfb4cc008c2f87af2bcc4aedec85cbc Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Pawe=C5=82=20Dziepak?= Date: Sat, 6 Sep 2014 16:41:58 +0200 Subject: [PATCH 2/8] kernel/x86_64: remove memset and memcpy from commpage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is absolutely no reason for these functions to be in commpage, they don't do anything that involves the kernel in any way. Additionally, this patch rewrites memset and memcpy to C++, the current implementation is quite simple (though it may perform surprisingly well when dealing with large buffers on cpus with ermsb). Better versions are coming soon. Signed-off-by: Paweł Dziepak --- .../system/arch/x86_64/arch_commpage_defs.h | 6 +- src/system/kernel/arch/x86/32/syscalls.cpp | 16 ++++ src/system/kernel/arch/x86/arch_cpu.cpp | 15 +--- src/system/kernel/lib/arch/x86_64/Jamfile | 7 +- .../kernel/lib/arch/x86_64/arch_string.S | 85 ------------------- .../libroot/posix/string/arch/x86_64/Jamfile | 4 +- .../posix/string/arch/x86_64/arch_string.S | 22 ----- .../posix/string/arch/x86_64/arch_string.cpp | 31 +++++++ 8 files changed, 55 insertions(+), 131 deletions(-) delete mode 100644 src/system/kernel/lib/arch/x86_64/arch_string.S delete mode 100644 src/system/libroot/posix/string/arch/x86_64/arch_string.S create mode 100644 src/system/libroot/posix/string/arch/x86_64/arch_string.cpp diff --git a/headers/private/system/arch/x86_64/arch_commpage_defs.h b/headers/private/system/arch/x86_64/arch_commpage_defs.h index 85fa54e104..1451011d18 100644 --- a/headers/private/system/arch/x86_64/arch_commpage_defs.h +++ b/headers/private/system/arch/x86_64/arch_commpage_defs.h @@ -9,11 +9,9 @@ # error Must not be included directly. Include instead! 
#endif -#define COMMPAGE_ENTRY_X86_MEMCPY (COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 0) -#define COMMPAGE_ENTRY_X86_MEMSET (COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 1) #define COMMPAGE_ENTRY_X86_SIGNAL_HANDLER \ - (COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 2) + (COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 0) #define COMMPAGE_ENTRY_X86_THREAD_EXIT \ - (COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 3) + (COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 1) #endif /* _SYSTEM_ARCH_x86_64_COMMPAGE_DEFS_H */ diff --git a/src/system/kernel/arch/x86/32/syscalls.cpp b/src/system/kernel/arch/x86/32/syscalls.cpp index c3ca4e1a24..7fcee05a3c 100644 --- a/src/system/kernel/arch/x86/32/syscalls.cpp +++ b/src/system/kernel/arch/x86/32/syscalls.cpp @@ -30,6 +30,10 @@ extern "C" void x86_sysenter(); void (*gX86SetSyscallStack)(addr_t stackTop) = NULL; +extern int memcpy_end; +extern int memset_end; + + static bool all_cpus_have_feature(enum x86_feature_type type, int feature) { @@ -109,8 +113,20 @@ x86_initialize_syscall(void) addr_t position = fill_commpage_entry(COMMPAGE_ENTRY_X86_SYSCALL, syscallCode, len); + // put the optimized functions into the commpage + size_t memcpyLen = (addr_t)&memcpy_end - (addr_t)memcpy; + addr_t memcpyPosition = fill_commpage_entry(COMMPAGE_ENTRY_X86_MEMCPY, + (const void*)memcpy, memcpyLen); + size_t memsetLen = (addr_t)&memset_end - (addr_t)memset; + addr_t memsetPosition = fill_commpage_entry(COMMPAGE_ENTRY_X86_MEMSET, + (const void*)memset, memsetLen); + // add syscall to the commpage image image_id image = get_commpage_image(); + elf_add_memory_image_symbol(image, "commpage_memcpy", memcpyPosition, + memcpyLen, B_SYMBOL_TYPE_TEXT); + elf_add_memory_image_symbol(image, "commpage_memset", memsetPosition, + memsetLen, B_SYMBOL_TYPE_TEXT); elf_add_memory_image_symbol(image, "commpage_syscall", position, len, B_SYMBOL_TYPE_TEXT); } diff --git a/src/system/kernel/arch/x86/arch_cpu.cpp b/src/system/kernel/arch/x86/arch_cpu.cpp index 9fd51ae6d6..8a2f7832d1 100644 --- 
a/src/system/kernel/arch/x86/arch_cpu.cpp +++ b/src/system/kernel/arch/x86/arch_cpu.cpp @@ -105,9 +105,6 @@ static const size_t kDoubleFaultStackSize = 4096; // size per CPU static x86_cpu_module_info* sCpuModule; -extern int memcpy_end; -extern int memset_end; - /* CPU topology information */ static uint32 (*sGetCPUTopologyID)(int currentCPU); static uint32 sHierarchyMask[CPU_TOPOLOGY_LEVELS]; @@ -1154,13 +1151,6 @@ arch_cpu_init_post_modules(kernel_args* args) call_all_cpus(&init_mtrrs, NULL); } - // put the optimized functions into the commpage - size_t memcpyLen = (addr_t)&memcpy_end - (addr_t)memcpy; - addr_t memcpyPosition = fill_commpage_entry(COMMPAGE_ENTRY_X86_MEMCPY, - (const void*)memcpy, memcpyLen); - size_t memsetLen = (addr_t)&memset_end - (addr_t)memset; - addr_t memsetPosition = fill_commpage_entry(COMMPAGE_ENTRY_X86_MEMSET, - (const void*)memset, memsetLen); size_t threadExitLen = (addr_t)x86_end_userspace_thread_exit - (addr_t)x86_userspace_thread_exit; addr_t threadExitPosition = fill_commpage_entry( @@ -1169,10 +1159,7 @@ arch_cpu_init_post_modules(kernel_args* args) // add the functions to the commpage image image_id image = get_commpage_image(); - elf_add_memory_image_symbol(image, "commpage_memcpy", memcpyPosition, - memcpyLen, B_SYMBOL_TYPE_TEXT); - elf_add_memory_image_symbol(image, "commpage_memset", memsetPosition, - memsetLen, B_SYMBOL_TYPE_TEXT); + elf_add_memory_image_symbol(image, "commpage_thread_exit", threadExitPosition, threadExitLen, B_SYMBOL_TYPE_TEXT); diff --git a/src/system/kernel/lib/arch/x86_64/Jamfile b/src/system/kernel/lib/arch/x86_64/Jamfile index 7f06e09e3a..67a3890ac0 100644 --- a/src/system/kernel/lib/arch/x86_64/Jamfile +++ b/src/system/kernel/lib/arch/x86_64/Jamfile @@ -21,6 +21,7 @@ KernelMergeObject kernel_os_arch_$(TARGET_ARCH).o : ; SEARCH_SOURCE += [ FDirName $(posixSources) arch $(TARGET_ARCH) ] ; +SEARCH_SOURCE += [ FDirName $(posixSources) string arch $(TARGET_ARCH) ] ; KernelMergeObject 
kernel_lib_posix_arch_$(TARGET_ARCH).o : siglongjmp.S @@ -28,12 +29,8 @@ KernelMergeObject kernel_lib_posix_arch_$(TARGET_ARCH).o : kernel_longjmp_return.c kernel_setjmp_save_sigs.c - arch_string.S + arch_string.cpp : $(TARGET_KERNEL_PIC_CCFLAGS) ; -# Explicitly tell the build system that arch_string.S includes the generated -# asm_offsets.h. -Includes [ FGristFiles arch_string.S ] - : asm_offsets.h ; diff --git a/src/system/kernel/lib/arch/x86_64/arch_string.S b/src/system/kernel/lib/arch/x86_64/arch_string.S deleted file mode 100644 index f0849e8fd8..0000000000 --- a/src/system/kernel/lib/arch/x86_64/arch_string.S +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2012, Alex Smith, alex@alex-smith.me.uk. - * Distributed under the terms of the MIT License. - */ - - -#include - - -.align 8 -FUNCTION(memcpy): - push %rbp - movq %rsp, %rbp - - // Preserve original destination address for return value. - movq %rdi, %rax - - // size -> %rcx - movq %rdx, %rcx - - // For small copies, always do it bytewise, the additional overhead is - // not worth it. - cmp $24, %rcx - jl .Lmemcpy_generic_byte_copy - - // Do both source and dest have the same alignment? - movq %rsi, %r8 - xorq %rdi, %r8 - test $7, %r8 - jnz .Lmemcpy_generic_byte_copy - - // Align up to an 8-byte boundary. - movq %rdi, %r8 - andq $7, %r8 - jz .Lmemcpy_generic_qword_copy - movq $8, %rcx - subq %r8, %rcx - subq %rcx, %rdx // Subtract from the overall count. - rep - movsb - - // Get back the original count value. - movq %rdx, %rcx -.Lmemcpy_generic_qword_copy: - // Move by quadwords. - shrq $3, %rcx - rep - movsq - - // Get the remaining count. - movq %rdx, %rcx - andq $7, %rcx -.Lmemcpy_generic_byte_copy: - // Move any remaining data by bytes. - rep - movsb - - pop %rbp - ret -FUNCTION_END(memcpy) -SYMBOL(memcpy_end): - - -.align 8 -FUNCTION(memset): - push %rbp - movq %rsp, %rbp - - // Preserve original destination address for return value. 
- movq %rdi, %r8 - - // size -> %rcx, value -> %al - movq %rdx, %rcx - movl %esi, %eax - - // Move by bytes. - rep - stosb - - movq %r8, %rax - pop %rbp - ret -FUNCTION_END(memset) -SYMBOL(memset_end): - diff --git a/src/system/libroot/posix/string/arch/x86_64/Jamfile b/src/system/libroot/posix/string/arch/x86_64/Jamfile index 6d843894d4..b8cd49040d 100644 --- a/src/system/libroot/posix/string/arch/x86_64/Jamfile +++ b/src/system/libroot/posix/string/arch/x86_64/Jamfile @@ -1,5 +1,7 @@ SubDir HAIKU_TOP src system libroot posix string arch x86_64 ; +SubDirC++Flags -std=gnu++11 ; + local architectureObject ; for architectureObject in [ MultiArchSubDirSetup x86_64 ] { on $(architectureObject) { @@ -8,7 +10,7 @@ for architectureObject in [ MultiArchSubDirSetup x86_64 ] { UsePrivateSystemHeaders ; MergeObject <$(architecture)>posix_string_arch_$(TARGET_ARCH).o : - arch_string.S + arch_string.cpp ; } } diff --git a/src/system/libroot/posix/string/arch/x86_64/arch_string.S b/src/system/libroot/posix/string/arch/x86_64/arch_string.S deleted file mode 100644 index e1273fdc3c..0000000000 --- a/src/system/libroot/posix/string/arch/x86_64/arch_string.S +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008, Ingo Weinhold, ingo_weinhold@gmx.de. - * Distributed under the terms of the MIT License. 
- */ - -#include -#include - - -FUNCTION(memcpy): - movq __gCommPageAddress@GOTPCREL(%rip), %rax - movq (%rax), %rax - addq 8 * COMMPAGE_ENTRY_X86_MEMCPY(%rax), %rax - jmp *%rax -FUNCTION_END(memcpy) - -FUNCTION(memset): - movq __gCommPageAddress@GOTPCREL(%rip), %rax - movq (%rax), %rax - addq 8 * COMMPAGE_ENTRY_X86_MEMSET(%rax), %rax - jmp *%rax -FUNCTION_END(memset) diff --git a/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp b/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp new file mode 100644 index 0000000000..b83376c331 --- /dev/null +++ b/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp @@ -0,0 +1,31 @@ +/* + * Copyright 2014, Paweł Dziepak, pdziepak@quarnos.org. + * Distributed under the terms of the MIT License. + */ + + +#include + + +extern "C" void* +memcpy(void* destination, const void* source, size_t length) +{ + auto returnValue = destination; + __asm__ __volatile__("rep movsb" + : "+D" (destination), "+S" (source), "+c" (length) + : : "memory"); + return returnValue; +} + + +extern "C" void* +memset(void* destination, int value, size_t length) +{ + auto returnValue = destination; + __asm__ __volatile__("rep stosb" + : "+D" (destination), "+c" (length) + : "a" (value) + : "memory"); + return returnValue; +} + From acad7bf64ac7be7ed3f83437efeac0f92d681e01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Dziepak?= Date: Sun, 14 Sep 2014 19:07:40 +0200 Subject: [PATCH 3/8] kernel/x86_64: make sure stack is properly aligned in syscalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just following the path of least resistance and adding andq $~15, %rsp where appropriate. That should also make things harder to break when changing the amount of stuff placed on stack before calling the actual syscall routine. 
Signed-off-by: Paweł Dziepak --- src/system/kernel/arch/x86/64/interrupts.S | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/system/kernel/arch/x86/64/interrupts.S b/src/system/kernel/arch/x86/64/interrupts.S index db73a4fba4..a63cd009b8 100644 --- a/src/system/kernel/arch/x86/64/interrupts.S +++ b/src/system/kernel/arch/x86/64/interrupts.S @@ -315,6 +315,7 @@ FUNCTION(x86_64_syscall_entry): // Frame pointer is the iframe. movq %rsp, %rbp + andq $~15, %rsp // Preserve call number (R14 is callee-save), get thread pointer. movq %rax, %r14 @@ -367,10 +368,10 @@ FUNCTION(x86_64_syscall_entry): // TODO: post-syscall tracing +.Lsyscall_return: // Restore the original stack pointer and return. movq %rbp, %rsp -.Lsyscall_return: // Clear the restarted flag. testl $THREAD_FLAGS_SYSCALL_RESTARTED, THREAD_flags(%r12) jz 2f @@ -493,6 +494,7 @@ FUNCTION(x86_64_syscall_entry): // Make space on the stack. subq %rcx, %rsp + andq $~15, %rsp movq %rsp, %rdi // Set a fault handler. From b41f281071b84235ea911f1e02123692798f706d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Dziepak?= Date: Sat, 6 Sep 2014 19:29:11 +0200 Subject: [PATCH 4/8] boot/x86_64: enable sse early MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable SSE as a part of the "preparation of the environment to run any C or C++ code" in the entry points of stage2 bootloader. SSE2 is going to be used by memset() and memcpy(). 
Signed-off-by: Paweł Dziepak --- headers/private/kernel/arch/x86/arch_cpu.h | 8 ++++++++ src/system/boot/platform/bios_ia32/long.cpp | 11 +++++++++++ src/system/kernel/arch/x86/arch_cpu.cpp | 8 -------- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/headers/private/kernel/arch/x86/arch_cpu.h b/headers/private/kernel/arch/x86/arch_cpu.h index 800f489300..c6d4c11c5e 100644 --- a/headers/private/kernel/arch/x86/arch_cpu.h +++ b/headers/private/kernel/arch/x86/arch_cpu.h @@ -243,6 +243,14 @@ | X86_EFLAGS_AUXILIARY_CARRY | X86_EFLAGS_ZERO | X86_EFLAGS_SIGN \ | X86_EFLAGS_DIRECTION | X86_EFLAGS_OVERFLOW) +#define CR0_CACHE_DISABLE (1UL << 30) +#define CR0_NOT_WRITE_THROUGH (1UL << 29) +#define CR0_FPU_EMULATION (1UL << 2) +#define CR0_MONITOR_FPU (1UL << 1) + +#define CR4_OS_FXSR (1UL << 9) +#define CR4_OS_XMM_EXCEPTION (1UL << 10) + // iframe types #define IFRAME_TYPE_SYSCALL 0x1 diff --git a/src/system/boot/platform/bios_ia32/long.cpp b/src/system/boot/platform/bios_ia32/long.cpp index 8531716198..e558bae6f8 100644 --- a/src/system/boot/platform/bios_ia32/long.cpp +++ b/src/system/boot/platform/bios_ia32/long.cpp @@ -278,6 +278,14 @@ convert_kernel_args() } +static void +enable_sse() +{ + x86_write_cr4(x86_read_cr4() | CR4_OS_FXSR | CR4_OS_XMM_EXCEPTION); + x86_write_cr0(x86_read_cr0() & ~(CR0_FPU_EMULATION | CR0_MONITOR_FPU)); +} + + static void long_smp_start_kernel(void) { @@ -287,6 +295,7 @@ long_smp_start_kernel(void) asm("movl %%eax, %%cr0" : : "a" ((1 << 31) | (1 << 16) | (1 << 5) | 1)); asm("cld"); asm("fninit"); + enable_sse(); // Fix our kernel stack address. 
gKernelArgs.cpu_kstack[cpu].start @@ -308,6 +317,8 @@ long_start_kernel() if ((info.regs.edx & (1 << 29)) == 0) panic("64-bit kernel requires a 64-bit CPU"); + enable_sse(); + preloaded_elf64_image *image = static_cast( gKernelArgs.kernel_image.Pointer()); diff --git a/src/system/kernel/arch/x86/arch_cpu.cpp b/src/system/kernel/arch/x86/arch_cpu.cpp index 8a2f7832d1..6cd2628ab5 100644 --- a/src/system/kernel/arch/x86/arch_cpu.cpp +++ b/src/system/kernel/arch/x86/arch_cpu.cpp @@ -59,14 +59,6 @@ static const struct cpu_vendor_info vendor_info[VENDOR_NUM] = { { "NSC", { "Geode by NSC" } }, }; -#define CR0_CACHE_DISABLE (1UL << 30) -#define CR0_NOT_WRITE_THROUGH (1UL << 29) -#define CR0_FPU_EMULATION (1UL << 2) -#define CR0_MONITOR_FPU (1UL << 1) - -#define CR4_OS_FXSR (1UL << 9) -#define CR4_OS_XMM_EXCEPTION (1UL << 10) - #define K8_SMIONCMPHALT (1ULL << 27) #define K8_C1EONCMPHALT (1ULL << 28) From 396b74228eefcf4bc21333e05c1909b8692d1b86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Dziepak?= Date: Wed, 10 Sep 2014 20:48:35 +0200 Subject: [PATCH 5/8] kernel/x86_64: save fpu state at interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel is allowed to use fpu anywhere so we must make sure that user state is not clobbered by saving fpu state at interrupt entry. There is no need to do that in case of system calls since all fpu data registers are caller saved. We do not need, though, to save the whole fpu state at task switch (again, thanks to calling convention). Only status and control registers are preserved. This patch actually adds xmm0-15 registers to the clobber list of task switch code, but the only reason for that is to make sure that nothing bad happens inside the function that executes that task switch. Inspection of the generated code shows that no xmm registers are actually saved. 
Signed-off-by: Paweł Dziepak --- headers/private/kernel/arch/x86/64/cpu.h | 10 ++- headers/private/kernel/arch/x86/64/iframe.h | 1 + headers/private/kernel/arch/x86/arch_cpu.h | 11 +-- src/system/kernel/arch/x86/64/arch.S | 29 -------- src/system/kernel/arch/x86/64/interrupts.S | 68 ++++++++++++++++--- src/system/kernel/arch/x86/64/thread.cpp | 31 +++++---- src/system/kernel/arch/x86/arch_cpu.cpp | 5 +- src/system/kernel/arch/x86/arch_thread.cpp | 4 ++ .../kernel/arch/x86/arch_user_debugger.cpp | 23 +++++-- src/system/kernel/arch/x86/asm_offsets.cpp | 1 + 10 files changed, 121 insertions(+), 62 deletions(-) diff --git a/headers/private/kernel/arch/x86/64/cpu.h b/headers/private/kernel/arch/x86/64/cpu.h index a29891d5de..02860371b8 100644 --- a/headers/private/kernel/arch/x86/64/cpu.h +++ b/headers/private/kernel/arch/x86/64/cpu.h @@ -28,6 +28,10 @@ x86_write_msr(uint32_t msr, uint64_t value) static inline void x86_context_switch(arch_thread* oldState, arch_thread* newState) { + uint16_t fpuControl; + asm volatile("fnstcw %0" : "=m" (fpuControl)); + uint32_t sseControl; + asm volatile("stmxcsr %0" : "=m" (sseControl)); asm volatile( "pushq %%rbp;" "movq $1f, %c[rip](%0);" @@ -41,7 +45,11 @@ x86_context_switch(arch_thread* oldState, arch_thread* newState) [rsp] "i" (offsetof(arch_thread, current_stack)), [rip] "i" (offsetof(arch_thread, instruction_pointer)) : "rbx", "rcx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", - "r14", "r15", "memory"); + "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", + "xmm14", "xmm15", "memory"); + asm volatile("ldmxcsr %0" : : "m" (sseControl)); + asm volatile("fldcw %0" : : "m" (fpuControl)); } diff --git a/headers/private/kernel/arch/x86/64/iframe.h b/headers/private/kernel/arch/x86/64/iframe.h index 2320949c5b..d4b61829db 100644 --- a/headers/private/kernel/arch/x86/64/iframe.h +++ b/headers/private/kernel/arch/x86/64/iframe.h @@ -8,6 +8,7 @@ 
struct iframe { uint64 type; + void* fpu; uint64 r15; uint64 r14; uint64 r13; diff --git a/headers/private/kernel/arch/x86/arch_cpu.h b/headers/private/kernel/arch/x86/arch_cpu.h index c6d4c11c5e..a202d08f4a 100644 --- a/headers/private/kernel/arch/x86/arch_cpu.h +++ b/headers/private/kernel/arch/x86/arch_cpu.h @@ -454,10 +454,7 @@ void __x86_setup_system_time(uint32 conversionFactor, void x86_userspace_thread_exit(void); void x86_end_userspace_thread_exit(void); -void x86_fxsave(void* fpuState); -void x86_fxrstor(const void* fpuState); -void x86_noop_swap(void* oldFpuState, const void* newFpuState); -void x86_fxsave_swap(void* oldFpuState, const void* newFpuState); + addr_t x86_get_stack_frame(); uint32 x86_count_mtrrs(void); void x86_set_mtrr(uint32 index, uint64 base, uint64 length, uint8 type); @@ -488,7 +485,13 @@ void x86_context_switch(struct arch_thread* oldState, void x86_fnsave(void* fpuState); void x86_frstor(const void* fpuState); + +void x86_fxsave(void* fpuState); +void x86_fxrstor(const void* fpuState); + +void x86_noop_swap(void* oldFpuState, const void* newFpuState); void x86_fnsave_swap(void* oldFpuState, const void* newFpuState); +void x86_fxsave_swap(void* oldFpuState, const void* newFpuState); #endif diff --git a/src/system/kernel/arch/x86/64/arch.S b/src/system/kernel/arch/x86/64/arch.S index 4cdd22af84..19c74b4d2d 100644 --- a/src/system/kernel/arch/x86/64/arch.S +++ b/src/system/kernel/arch/x86/64/arch.S @@ -19,35 +19,6 @@ .text -/* void x86_fxsave(void* fpuState); */ -FUNCTION(x86_fxsave): - fxsave (%rdi) - ret -FUNCTION_END(x86_fxsave) - - -/* void x86_fxrstor(const void* fpuState); */ -FUNCTION(x86_fxrstor): - fxrstor (%rdi) - ret -FUNCTION_END(x86_fxrstor) - - -/* void x86_noop_swap(void *oldFpuState, const void *newFpuState); */ -FUNCTION(x86_noop_swap): - nop - ret -FUNCTION_END(x86_noop_swap) - - -/* void x86_fxsave_swap(void* oldFpuState, const void* newFpuState); */ -FUNCTION(x86_fxsave_swap): - fxsave (%rdi) - fxrstor (%rsi) - ret 
-FUNCTION_END(x86_fxsave_swap) - - /* addr_t x86_get_stack_frame(); */ FUNCTION(x86_get_stack_frame): mov %rbp, %rax diff --git a/src/system/kernel/arch/x86/64/interrupts.S b/src/system/kernel/arch/x86/64/interrupts.S index a63cd009b8..50ba3ea384 100644 --- a/src/system/kernel/arch/x86/64/interrupts.S +++ b/src/system/kernel/arch/x86/64/interrupts.S @@ -35,11 +35,13 @@ push %r13; \ push %r14; \ push %r15; \ + pushq $0; \ push $iframeType; + // Restore the interrupt frame. #define RESTORE_IFRAME() \ - add $8, %rsp; \ + add $16, %rsp; \ pop %r15; \ pop %r14; \ pop %r13; \ @@ -198,11 +200,18 @@ STATIC_FUNCTION(int_bottom): // exception. orq $X86_EFLAGS_RESUME, IFRAME_flags(%rbp) + subq $512, %rsp + andq $~15, %rsp + fxsaveq (%rsp) + // Call the interrupt handler. - movq %rsp, %rdi - movq IFRAME_vector(%rsp), %rax + movq %rbp, %rdi + movq IFRAME_vector(%rbp), %rax call *gInterruptHandlerTable(, %rax, 8) + fxrstorq (%rsp) + movq %rbp, %rsp + // Restore the saved registers. RESTORE_IFRAME() @@ -217,12 +226,16 @@ STATIC_FUNCTION(int_bottom_user): // Push the rest of the interrupt frame to the stack. PUSH_IFRAME_BOTTOM(IFRAME_TYPE_OTHER) - cld // Frame pointer is the iframe. movq %rsp, %rbp + subq $512, %rsp + andq $~15, %rsp + fxsaveq (%rsp) + movq %rsp, IFRAME_fpu(%rbp) + // Set the RF (resume flag) in RFLAGS. This prevents an instruction // breakpoint on the instruction we're returning to to trigger a debug // exception. @@ -235,8 +248,8 @@ STATIC_FUNCTION(int_bottom_user): UPDATE_THREAD_USER_TIME() // Call the interrupt handler. - movq %rsp, %rdi - movq IFRAME_vector(%rsp), %rax + movq %rbp, %rdi + movq IFRAME_vector(%rbp), %rax call *gInterruptHandlerTable(, %rax, 8) // If there are no signals pending or we're not debugging, we can avoid @@ -250,6 +263,9 @@ STATIC_FUNCTION(int_bottom_user): UPDATE_THREAD_KERNEL_TIME() + fxrstorq (%rsp) + movq %rbp, %rsp + // Restore the saved registers. 
RESTORE_IFRAME() @@ -274,6 +290,9 @@ STATIC_FUNCTION(int_bottom_user): movq %rbp, %rdi call x86_init_user_debug_at_kernel_exit 1: + fxrstorq (%rsp) + movq %rbp, %rsp + // Restore the saved registers. RESTORE_IFRAME() @@ -395,7 +414,7 @@ FUNCTION(x86_64_syscall_entry): // If we've just restored a signal frame, use the IRET path. cmpq $SYSCALL_RESTORE_SIGNAL_FRAME, %r14 - je .Liret + je .Lrestore_fpu // Restore the iframe and RCX/R11 for SYSRET. RESTORE_IFRAME() @@ -466,7 +485,11 @@ FUNCTION(x86_64_syscall_entry): // On this return path it is possible that the frame has been modified, // for example to execute a signal handler. In this case it is safer to // return via IRET. + jmp .Liret +.Lrestore_fpu: + movq IFRAME_fpu(%rbp), %rax + fxrstorq (%rax) .Liret: // Restore the saved registers. RESTORE_IFRAME() @@ -537,7 +560,7 @@ FUNCTION(x86_return_to_userland): testl $(THREAD_FLAGS_DEBUGGER_INSTALLED | THREAD_FLAGS_SIGNALS_PENDING \ | THREAD_FLAGS_DEBUG_THREAD | THREAD_FLAGS_BREAKPOINTS_DEFINED) \ , THREAD_flags(%r12) - jnz .Lkernel_exit_work + jnz .Luserland_return_work // update the thread's kernel time and return UPDATE_THREAD_KERNEL_TIME() @@ -546,4 +569,33 @@ FUNCTION(x86_return_to_userland): RESTORE_IFRAME() swapgs iretq +.Luserland_return_work: + // Slow path for return to userland. + + // Do we need to handle signals? + testl $(THREAD_FLAGS_SIGNALS_PENDING | THREAD_FLAGS_DEBUG_THREAD) \ + , THREAD_flags(%r12) + jnz .Luserland_return_handle_signals + cli + call thread_at_kernel_exit_no_signals + +.Luserland_return_work_done: + // Install breakpoints, if defined. + testl $THREAD_FLAGS_BREAKPOINTS_DEFINED, THREAD_flags(%r12) + jz 1f + movq %rbp, %rdi + call x86_init_user_debug_at_kernel_exit +1: + // Restore the saved registers. + RESTORE_IFRAME() + + // Restore the previous GS base and return. + swapgs + iretq +.Luserland_return_handle_signals: + // thread_at_kernel_exit requires interrupts to be enabled, it will disable + // them after. 
+ sti + call thread_at_kernel_exit + jmp .Luserland_return_work_done FUNCTION_END(x86_return_to_userland) diff --git a/src/system/kernel/arch/x86/64/thread.cpp b/src/system/kernel/arch/x86/64/thread.cpp index 961a2534c1..9e535fefbe 100644 --- a/src/system/kernel/arch/x86/64/thread.cpp +++ b/src/system/kernel/arch/x86/64/thread.cpp @@ -134,9 +134,12 @@ arch_thread_init(kernel_args* args) { // Save one global valid FPU state; it will be copied in the arch dependent // part of each new thread. - asm volatile ("clts; fninit; fnclex;"); - x86_fxsave(sInitialState.fpu_state); - + asm volatile ( + "clts;" \ + "fninit;" \ + "fnclex;" \ + "fxsave %0;" + : "=m" (sInitialState.fpu_state)); return B_OK; } @@ -296,15 +299,14 @@ arch_setup_signal_frame(Thread* thread, struct sigaction* action, signalFrameData->context.uc_mcontext.rip = frame->ip; signalFrameData->context.uc_mcontext.rflags = frame->flags; - // Store the FPU state. There appears to be a bug in GCC where the aligned - // attribute on a structure is being ignored when the structure is allocated - // on the stack, so even if the fpu_state struct has aligned(16) it may not - // get aligned correctly. Instead, use the current thread's FPU save area - // and then memcpy() to the frame structure. - x86_fxsave(thread->arch_info.fpu_state); - memcpy((void*)&signalFrameData->context.uc_mcontext.fpu, - thread->arch_info.fpu_state, - sizeof(signalFrameData->context.uc_mcontext.fpu)); + if (frame->fpu != nullptr) { + memcpy((void*)&signalFrameData->context.uc_mcontext.fpu, frame->fpu, + sizeof(signalFrameData->context.uc_mcontext.fpu)); + } else { + memcpy((void*)&signalFrameData->context.uc_mcontext.fpu, + sInitialState.fpu_state, + sizeof(signalFrameData->context.uc_mcontext.fpu)); + } // Fill in signalFrameData->context.uc_stack. 
signal_get_user_stack(frame->user_sp, &signalFrameData->context.uc_stack); @@ -370,13 +372,12 @@ arch_restore_signal_frame(struct signal_frame_data* signalFrameData) frame->flags = (frame->flags & ~(uint64)X86_EFLAGS_USER_FLAGS) | (signalFrameData->context.uc_mcontext.rflags & X86_EFLAGS_USER_FLAGS); - // Same as above, alignment may not be correct. Copy to thread and restore - // from there. Thread* thread = thread_get_current_thread(); + memcpy(thread->arch_info.fpu_state, (void*)&signalFrameData->context.uc_mcontext.fpu, sizeof(thread->arch_info.fpu_state)); - x86_fxrstor(thread->arch_info.fpu_state); + frame->fpu = &thread->arch_info.fpu_state; // The syscall return code overwrites frame->ax with the return value of // the syscall, need to return it here to ensure the correct value is diff --git a/src/system/kernel/arch/x86/arch_cpu.cpp b/src/system/kernel/arch/x86/arch_cpu.cpp index 6cd2628ab5..65ea7fce1c 100644 --- a/src/system/kernel/arch/x86/arch_cpu.cpp +++ b/src/system/kernel/arch/x86/arch_cpu.cpp @@ -82,8 +82,10 @@ extern "C" void x86_reboot(void); // from arch.S void (*gCpuIdleFunc)(void); +#ifndef __x86_64__ void (*gX86SwapFPUFunc)(void* oldState, const void* newState) = x86_noop_swap; bool gHasSSE = false; +#endif static uint32 sCpuRendezvous; static uint32 sCpuRendezvous2; @@ -318,13 +320,14 @@ x86_init_fpu(void) #endif dprintf("%s: CPU has SSE... 
enabling FXSR and XMM.\n", __func__); - +#ifndef __x86_64__ // enable OS support for SSE x86_write_cr4(x86_read_cr4() | CR4_OS_FXSR | CR4_OS_XMM_EXCEPTION); x86_write_cr0(x86_read_cr0() & ~(CR0_FPU_EMULATION | CR0_MONITOR_FPU)); gX86SwapFPUFunc = x86_fxsave_swap; gHasSSE = true; +#endif } diff --git a/src/system/kernel/arch/x86/arch_thread.cpp b/src/system/kernel/arch/x86/arch_thread.cpp index 25cde443ed..931c74b099 100644 --- a/src/system/kernel/arch/x86/arch_thread.cpp +++ b/src/system/kernel/arch/x86/arch_thread.cpp @@ -31,7 +31,9 @@ extern "C" void x86_return_to_userland(iframe* frame); // from arch_cpu.cpp +#ifndef __x86_64__ extern void (*gX86SwapFPUFunc)(void *oldState, const void *newState); +#endif static struct iframe* @@ -245,7 +247,9 @@ arch_thread_context_switch(Thread* from, Thread* to) activePagingStructures->RemoveReference(); } +#ifndef __x86_64__ gX86SwapFPUFunc(from->arch_info.fpu_state, to->arch_info.fpu_state); +#endif x86_context_switch(&from->arch_info, &to->arch_info); } diff --git a/src/system/kernel/arch/x86/arch_user_debugger.cpp b/src/system/kernel/arch/x86/arch_user_debugger.cpp index 516ed8f340..8b621d9e3c 100644 --- a/src/system/kernel/arch/x86/arch_user_debugger.cpp +++ b/src/system/kernel/arch/x86/arch_user_debugger.cpp @@ -33,7 +33,9 @@ // TODO: Make those real error codes. +#ifndef __x86_64__ extern bool gHasSSE; +#endif // The software breakpoint instruction (int3). const uint8 kX86SoftwareBreakpoint[1] = { 0xcc }; @@ -688,6 +690,12 @@ arch_set_debug_cpu_state(const debug_cpu_state* cpuState) if (iframe* frame = x86_get_user_iframe()) { // For the floating point state to be correct the calling function must // not use these registers (not even indirectly). 
+#ifdef __x86_64__ + Thread* thread = thread_get_current_thread(); + memcpy(thread->arch_info.fpu_state, &cpuState->extended_registers, + sizeof(cpuState->extended_registers)); + frame->fpu = &thread->arch_info.fpu_state; +#else if (gHasSSE) { // Since fxrstor requires 16-byte alignment and this isn't // guaranteed passed buffer, we use our thread's fpu_state field as @@ -698,12 +706,11 @@ arch_set_debug_cpu_state(const debug_cpu_state* cpuState) memcpy(thread->arch_info.fpu_state, &cpuState->extended_registers, sizeof(cpuState->extended_registers)); x86_fxrstor(thread->arch_info.fpu_state); -#ifndef __x86_64__ } else { // TODO: Implement! We need to convert the format first. // x86_frstor(&cpuState->extended_registers); -#endif } +#endif set_iframe_registers(frame, cpuState); } } @@ -715,6 +722,15 @@ arch_get_debug_cpu_state(debug_cpu_state* cpuState) if (iframe* frame = x86_get_user_iframe()) { // For the floating point state to be correct the calling function must // not use these registers (not even indirectly). +#ifdef __x86_64__ + if (frame->fpu != nullptr) { + memcpy(&cpuState->extended_registers, frame->fpu, + sizeof(cpuState->extended_registers)); + } else { + memset(&cpuState->extended_registers, 0, + sizeof(cpuState->extended_registers)); + } +#else if (gHasSSE) { // Since fxsave requires 16-byte alignment and this isn't guaranteed // passed buffer, we use our thread's fpu_state field as temporary @@ -725,15 +741,14 @@ arch_get_debug_cpu_state(debug_cpu_state* cpuState) // unlike fnsave, fxsave doesn't reinit the FPU state memcpy(&cpuState->extended_registers, thread->arch_info.fpu_state, sizeof(cpuState->extended_registers)); -#ifndef __x86_64__ } else { x86_fnsave(&cpuState->extended_registers); x86_frstor(&cpuState->extended_registers); // fnsave reinits the FPU state after saving, so we need to // load it again // TODO: Convert to fxsave format! 
-#endif } +#endif get_iframe_registers(frame, cpuState); } } diff --git a/src/system/kernel/arch/x86/asm_offsets.cpp b/src/system/kernel/arch/x86/asm_offsets.cpp index db89298604..d89200bf87 100644 --- a/src/system/kernel/arch/x86/asm_offsets.cpp +++ b/src/system/kernel/arch/x86/asm_offsets.cpp @@ -70,6 +70,7 @@ dummy() DEFINE_OFFSET_MACRO(IFRAME, iframe, r8); DEFINE_OFFSET_MACRO(IFRAME, iframe, r9); DEFINE_OFFSET_MACRO(IFRAME, iframe, r10); + DEFINE_OFFSET_MACRO(IFRAME, iframe, fpu); #else DEFINE_OFFSET_MACRO(IFRAME, iframe, orig_eax); #endif From 718fd007a635d32df8ca3ff5fe5e13f76a4ea041 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Dziepak?= Date: Sun, 14 Sep 2014 19:02:27 +0200 Subject: [PATCH 6/8] kernel/x86_64: clear xmm0-15 registers on syscall exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As Alex pointed out we can leak possibly sensitive data in xmm registers when returning from the kernel. To prevent that xmm0-15 are zeroed before sysret or iret. The cost is negligible. 
Signed-off-by: Paweł Dziepak --- src/system/kernel/arch/x86/64/interrupts.S | 24 ++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/system/kernel/arch/x86/64/interrupts.S b/src/system/kernel/arch/x86/64/interrupts.S index 50ba3ea384..c94f0d8dc9 100644 --- a/src/system/kernel/arch/x86/64/interrupts.S +++ b/src/system/kernel/arch/x86/64/interrupts.S @@ -118,6 +118,23 @@ call x86_exit_user_debug_at_kernel_entry; \ 1: +#define CLEAR_FPU_STATE() \ + pxor %xmm0, %xmm0; \ + pxor %xmm1, %xmm1; \ + pxor %xmm2, %xmm2; \ + pxor %xmm3, %xmm3; \ + pxor %xmm4, %xmm4; \ + pxor %xmm5, %xmm5; \ + pxor %xmm6, %xmm6; \ + pxor %xmm7, %xmm7; \ + pxor %xmm8, %xmm8; \ + pxor %xmm9, %xmm9; \ + pxor %xmm10, %xmm10; \ + pxor %xmm11, %xmm11; \ + pxor %xmm12, %xmm12; \ + pxor %xmm13, %xmm13; \ + pxor %xmm14, %xmm14; \ + pxor %xmm15, %xmm15 // The following code defines the interrupt service routines for all 256 // interrupts. It creates a block of handlers, each 16 bytes, that the IDT @@ -416,6 +433,8 @@ FUNCTION(x86_64_syscall_entry): cmpq $SYSCALL_RESTORE_SIGNAL_FRAME, %r14 je .Lrestore_fpu + CLEAR_FPU_STATE() + // Restore the iframe and RCX/R11 for SYSRET. RESTORE_IFRAME() pop %rcx @@ -478,13 +497,14 @@ FUNCTION(x86_64_syscall_entry): 1: // Install breakpoints, if defined. testl $THREAD_FLAGS_BREAKPOINTS_DEFINED, THREAD_flags(%r12) - jz .Liret + jz 1f movq %rbp, %rdi call x86_init_user_debug_at_kernel_exit - +1: // On this return path it is possible that the frame has been modified, // for example to execute a signal handler. In this case it is safer to // return via IRET. 
+ CLEAR_FPU_STATE() jmp .Liret .Lrestore_fpu: From 1d7b716f84b6cbed439a33aa4df78f3b0dfc279b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Dziepak?= Date: Sun, 7 Sep 2014 21:43:28 +0200 Subject: [PATCH 7/8] libroot/x86_64: new memset implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces new memset() implementation that improves the performance when the buffer is small. It was written for processors that support ERMSB, but performs reasonably well on older CPUs as well. The following benchmarks were done on Haswell i7 running Debian Jessie with Linux 3.16.1. In each iteration 64MB buffer was memset()ed, the parameter "size" is the size of the buffer passed in a single call (i.e. for "size: 2" memset() was called ~32 million times to memset the whole 64MB). f - original implementation, g - new implementation, all buffers 16 byte aligned set, size: 8, f: 66885 µs, g: 17768 µs, ∆: 73.44% set, size: 32, f: 17123 µs, g: 9163 µs, ∆: 46.49% set, size: 128, f: 6677 µs, g: 6919 µs, ∆: -3.62% set, size: 512, f: 11656 µs, g: 7715 µs, ∆: 33.81% set, size: 1024, f: 9156 µs, g: 7359 µs, ∆: 19.63% set, size: 4096, f: 4936 µs, g: 5159 µs, ∆: -4.52% f - glibc 2.19 implementation, g - new implementation, all buffers 16 byte aligned set, size: 8, f: 19631 µs, g: 17828 µs, ∆: 9.18% set, size: 32, f: 8545 µs, g: 9047 µs, ∆: -5.87% set, size: 128, f: 8304 µs, g: 6874 µs, ∆: 17.22% set, size: 512, f: 7373 µs, g: 7486 µs, ∆: -1.53% set, size: 1024, f: 9007 µs, g: 7344 µs, ∆: 18.46% set, size: 4096, f: 8169 µs, g: 5146 µs, ∆: 37.01% Apparently, glibc uses SSE even for large buffers and therefore does not take advantage of ERMSB: set, size: 16384, f: 7007 µs, g: 3223 µs, ∆: 54.00% set, size: 32768, f: 6979 µs, g: 2930 µs, ∆: 58.02% set, size: 65536, f: 6907 µs, g: 2826 µs, ∆: 59.08% set, size: 131072, f: 6919 µs, g: 2752 µs, ∆: 60.23% The new implementation handles unaligned buffers quite well: f - glibc 2.19 
implementation, g - new implementation, all buffers unaligned set, size: 16, f: 10045 µs, g: 10498 µs, ∆: -4.51% set, size: 32, f: 8590 µs, g: 9358 µs, ∆: -8.94% set, size: 64, f: 8618 µs, g: 8585 µs, ∆: 0.38% set, size: 128, f: 8393 µs, g: 6893 µs, ∆: 17.87% set, size: 256, f: 8042 µs, g: 7621 µs, ∆: 5.24% set, size: 512, f: 9661 µs, g: 7738 µs, ∆: 19.90% Signed-off-by: Paweł Dziepak --- .../posix/string/arch/x86_64/arch_string.cpp | 74 ++++++++++++++++++- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp b/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp index b83376c331..33fca22474 100644 --- a/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp +++ b/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp @@ -5,6 +5,9 @@ #include +#include + +#include extern "C" void* @@ -18,14 +21,77 @@ memcpy(void* destination, const void* source, size_t length) } -extern "C" void* -memset(void* destination, int value, size_t length) +static inline void +memset_repstos(uint8_t* destination, uint8_t value, size_t length) { - auto returnValue = destination; __asm__ __volatile__("rep stosb" : "+D" (destination), "+c" (length) : "a" (value) : "memory"); - return returnValue; +} + + +static inline void +memset_sse(uint8_t* destination, uint8_t value, size_t length) +{ + __m128i packed = _mm_set1_epi8(value); + auto end = reinterpret_cast<__m128i*>(destination + length - 16); + auto diff = reinterpret_cast(destination) % 16; + if (diff) { + diff = 16 - diff; + length -= diff; + _mm_storeu_si128(reinterpret_cast<__m128i*>(destination), packed); + } + auto ptr = reinterpret_cast<__m128i*>(destination + diff); + while (length >= 64) { + _mm_store_si128(ptr++, packed); + _mm_store_si128(ptr++, packed); + _mm_store_si128(ptr++, packed); + _mm_store_si128(ptr++, packed); + length -= 64; + } + while (length >= 16) { + _mm_store_si128(ptr++, packed); + length -= 16; + } + _mm_storeu_si128(end, 
packed); +} + + +static inline void +memset_small(uint8_t* destination, uint8_t value, size_t length) +{ + if (length >= 8) { + auto packed = value * 0x101010101010101ul; + auto ptr = reinterpret_cast(destination); + auto end = reinterpret_cast(destination + length - 8); + while (length >= 8) { + *ptr++ = packed; + length -= 8; + } + *end = packed; + } else { + while (length--) { + *destination++ = value; + } + } +} + + +extern "C" void* +memset(void* ptr, int chr, size_t length) +{ + auto value = static_cast(chr); + auto destination = static_cast(ptr); + if (length < 32) { + memset_small(destination, value, length); + return ptr; + } + if (length < 2048) { + memset_sse(destination, value, length); + return ptr; + } + memset_repstos(destination, value, length); + return ptr; } From 4582b6e3a3c960362d4f9f2f31b7c43887226fa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Dziepak?= Date: Wed, 10 Sep 2014 22:09:57 +0200 Subject: [PATCH 8/8] libroot/x86_64: new memcpy implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces new memcpy() implementation that improves the performance when the buffer is small. It was written for processors that support ERMSB, but performs reasonably well on older CPUs as well. The following benchmarks were done on Haswell i7 running Debian Jessie with Linux 3.16.1. In each iteration 64MB buffer was copied, the parameter "size" is the size of the buffer passed in a single call (i.e. for "size: 2" memcpy() was called ~32 million times to copy the whole 64MB). 
f - original implementation, g - new implementation, all buffers 16 byte aligned cpy, size: 8, f: 79971 µs, g: 20419 µs, ∆: 74.47% cpy, size: 32, f: 42068 µs, g: 12159 µs, ∆: 71.10% cpy, size: 128, f: 13408 µs, g: 10359 µs, ∆: 22.74% cpy, size: 512, f: 10634 µs, g: 10433 µs, ∆: 1.89% cpy, size: 1024, f: 10474 µs, g: 10536 µs, ∆: -0.59% cpy, size: 4096, f: 9419 µs, g: 8630 µs, ∆: 8.38% f - glibc 2.19 implementation, g - new implementation, all buffers 16 byte aligned cpy, size: 8, f: 26299 µs, g: 20919 µs, ∆: 20.46% cpy, size: 32, f: 11146 µs, g: 12159 µs, ∆: -9.09% cpy, size: 128, f: 10778 µs, g: 10354 µs, ∆: 3.93% cpy, size: 512, f: 12291 µs, g: 10426 µs, ∆: 15.17% cpy, size: 1024, f: 13923 µs, g: 10571 µs, ∆: 24.08% cpy, size: 4096, f: 11770 µs, g: 8671 µs, ∆: 26.33% f - glibc 2.19 implementation, g - new implementation, all buffers unaligned cpy, size: 16, f: 13376 µs, g: 13009 µs, ∆: 2.74% cpy, size: 32, f: 11130 µs, g: 12171 µs, ∆: -9.35% cpy, size: 64, f: 11017 µs, g: 11231 µs, ∆: -1.94% cpy, size: 128, f: 10884 µs, g: 10407 µs, ∆: 4.38% cpy, size: 256, f: 10826 µs, g: 10106 µs, ∆: 6.65% cpy, size: 512, f: 12354 µs, g: 10396 µs, ∆: 15.85% Signed-off-by: Paweł Dziepak --- .../posix/string/arch/x86_64/arch_string.cpp | 129 +++++++++++++++++- 1 file changed, 124 insertions(+), 5 deletions(-) diff --git a/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp b/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp index 33fca22474..8639327b20 100644 --- a/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp +++ b/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp @@ -4,20 +4,139 @@ */ +#include + #include #include #include -extern "C" void* -memcpy(void* destination, const void* source, size_t length) +namespace { + + +template class Generator, unsigned N, unsigned ...Index> +struct GenerateTable : GenerateTable { +}; + +template class Generator, unsigned ...Index> +struct GenerateTable + : std::array::sValue), sizeof...(Index)> { + 
constexpr GenerateTable() + : + std::array::sValue), sizeof...(Index)> { + { Generator::sValue... } + } + { + } +}; + + +static inline void memcpy_repmovs(uint8_t* destination, const uint8_t* source, + size_t length) { - auto returnValue = destination; __asm__ __volatile__("rep movsb" : "+D" (destination), "+S" (source), "+c" (length) - : : "memory"); - return returnValue; + : + : "memory"); +} + + +template +inline void copy_small(uint8_t* destination, const uint8_t* source) +{ + struct data { + uint8_t x[N]; + }; + *reinterpret_cast(destination) + = *reinterpret_cast(source); +} + + +template +struct SmallGenerator { + constexpr static void (*sValue)(uint8_t*, const uint8_t*) = copy_small; +}; +constexpr static GenerateTable table_small; + + +static inline void memcpy_small(uint8_t* destination, const uint8_t* source, + size_t length) +{ + if (length < 8) { + table_small[length](destination, source); + } else { + auto to = reinterpret_cast(destination); + auto from = reinterpret_cast(source); + *to = *from; + to = reinterpret_cast(destination + length - 8); + from = reinterpret_cast(source + length - 8); + *to = *from; + } +} + + +template +inline void copy_sse(__m128i* destination, const __m128i* source) +{ + auto temp = _mm_loadu_si128(source); + _mm_storeu_si128(destination, temp); + copy_sse(destination + 1, source + 1); +} + + +template<> +inline void copy_sse<0>(__m128i* destination, const __m128i* source) +{ +} + + +template +struct SSEGenerator { + constexpr static void (*sValue)(__m128i*, const __m128i*) = copy_sse; +}; +constexpr static GenerateTable table_sse; + + +static inline void memcpy_sse(uint8_t* destination, const uint8_t* source, size_t length) +{ + auto to = reinterpret_cast<__m128i*>(destination); + auto from = reinterpret_cast(source); + auto toEnd = reinterpret_cast<__m128i*>(destination + length - 16); + auto fromEnd = reinterpret_cast(source + length - 16); + while (length >= 64) { + copy_sse<4>(to, from); + to += 4; + from += 4; + 
length -= 64; + } + if (length >= 16) { + table_sse[length / 16](to, from); + length %= 16; + } + if (length) { + copy_sse<1>(toEnd, fromEnd); + } +} + + +} + + +extern "C" void* memcpy(void* destination, const void* source, size_t length) +{ + auto to = static_cast(destination); + auto from = static_cast(source); + if (length <= 16) { + memcpy_small(to, from, length); + return destination; + } + if (length < 2048) { + memcpy_sse(to, from, length); + return destination; + } + memcpy_repmovs(to, from, length); + return destination; }