Merge branch 'memcpy-v3'

This is a major rework of how Haiku implements memset() and memcpy() on
x64. These functions are removed from the commpage and reimplemented in
C++, using SSE2 where it proved beneficial. That required some serious
changes in FPU state handling: the full FPU state is now saved at each
interrupt, but no longer on task switch.
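
To make that concrete before the diffs: on the interrupt path a 512-byte,
16-byte-aligned FXSAVE area is carved out of the interrupt stack, filled on
entry and restored on exit, so handlers may use SSE freely. A minimal
standalone sketch of that pattern (illustration only, not the kernel code):

#include <cstdint>

// FXSAVE/FXRSTOR require a 512-byte buffer aligned to 16 bytes; the real
// code in interrupts.S allocates it on the interrupt stack and stores its
// address in the iframe so the state can be found again later.
void handle_interrupt_sketch(void (*handler)())
{
	alignas(16) uint8_t fpuState[512];
	asm volatile("fxsave %0" : "=m" (fpuState));	// save x87/MMX/SSE state
	handler();	// may clobber the FPU and SSE state at will
	asm volatile("fxrstor %0" : : "m" (fpuState));	// restore on the way out
}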

Some numbers: time needed to build the targets kernel, libroot.so,
runtime_loader and HaikuDepot on an Intel i7 4770 with 16 GB of memory,
four runs each:

         real       user       sys
before:  1m54.367   7m40.617   0m58.641
         1m33.922   8m12.362   1m0.852
         1m32.922   8m10.509   1m1.006
         1m31.947   8m12.596   1m1.353

after:   1m50.959   7m43.118   0m58.923
         1m30.644   8m6.385    1m0.584
         1m31.549   8m7.976    0m59.792
         1m31.546   8m6.733    1m0.242
Paweł Dziepak, 2014-09-14 19:26:07 +02:00
commit e81b792e8f
19 changed files with 410 additions and 287 deletions


@@ -28,6 +28,10 @@ x86_write_msr(uint32_t msr, uint64_t value)
 static inline void
 x86_context_switch(arch_thread* oldState, arch_thread* newState)
 {
+	uint16_t fpuControl;
+	asm volatile("fnstcw %0" : "=m" (fpuControl));
+	uint32_t sseControl;
+	asm volatile("stmxcsr %0" : "=m" (sseControl));
 	asm volatile(
 		"pushq %%rbp;"
 		"movq $1f, %c[rip](%0);"
@@ -41,7 +45,11 @@ x86_context_switch(arch_thread* oldState, arch_thread* newState)
 		[rsp] "i" (offsetof(arch_thread, current_stack)),
 		[rip] "i" (offsetof(arch_thread, instruction_pointer))
 		: "rbx", "rcx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13",
-			"r14", "r15", "memory");
+			"r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+			"xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
+			"xmm14", "xmm15", "memory");
+	asm volatile("ldmxcsr %0" : : "m" (sseControl));
+	asm volatile("fldcw %0" : : "m" (fpuControl));
 }
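
The xmm0-xmm15 clobbers above make the compiler itself spill and reload any
vector values live across the switch; the x87 and SSE control words are not
covered by a clobber list, though, hence the explicit fnstcw/stmxcsr and
fldcw/ldmxcsr pairs. The same pattern as a standalone sketch (hypothetical
helper, not from the commit):

#include <cstdint>

// Preserve only the FPU control registers around a call. The register
// contents themselves are left to the compiler, which must assume they are
// destroyed once they appear in the asm statement's clobber list.
static inline void call_preserving_control_words(void (*fn)())
{
	uint16_t fpuControl;
	uint32_t sseControl;
	asm volatile("fnstcw %0" : "=m" (fpuControl));
	asm volatile("stmxcsr %0" : "=m" (sseControl));
	fn();
	asm volatile("ldmxcsr %0" : : "m" (sseControl));
	asm volatile("fldcw %0" : : "m" (fpuControl));
}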


@@ -8,6 +8,7 @@
 struct iframe {
 	uint64		type;
+	void*		fpu;
 	uint64		r15;
 	uint64		r14;
 	uint64		r13;


@@ -243,6 +243,14 @@
 	| X86_EFLAGS_AUXILIARY_CARRY | X86_EFLAGS_ZERO | X86_EFLAGS_SIGN \
 	| X86_EFLAGS_DIRECTION | X86_EFLAGS_OVERFLOW)
 
+#define CR0_CACHE_DISABLE		(1UL << 30)
+#define CR0_NOT_WRITE_THROUGH	(1UL << 29)
+#define CR0_FPU_EMULATION		(1UL << 2)
+#define CR0_MONITOR_FPU			(1UL << 1)
+
+#define CR4_OS_FXSR				(1UL << 9)
+#define CR4_OS_XMM_EXCEPTION	(1UL << 10)
+
 // iframe types
 #define IFRAME_TYPE_SYSCALL	0x1
@@ -262,13 +270,6 @@ typedef struct x86_mtrr_info {
 	uint8	type;
 } x86_mtrr_info;
 
-typedef struct x86_optimized_functions {
-	void	(*memcpy)(void* dest, const void* source, size_t count);
-	void*	memcpy_end;
-	void	(*memset)(void* dest, int value, size_t count);
-	void*	memset_end;
-} x86_optimized_functions;
-
 typedef struct x86_cpu_module_info {
 	module_info	info;
 	uint32		(*count_mtrrs)(void);
@@ -280,8 +281,6 @@ typedef struct x86_cpu_module_info {
 				uint8* _type);
 	void	(*set_mtrrs)(uint8 defaultType, const x86_mtrr_info* infos,
 				uint32 count);
-
-	void	(*get_optimized_functions)(x86_optimized_functions* functions);
 } x86_cpu_module_info;
 
 // features
@@ -455,10 +454,7 @@ void __x86_setup_system_time(uint32 conversionFactor,
 void x86_userspace_thread_exit(void);
 void x86_end_userspace_thread_exit(void);
 
-void x86_fxsave(void* fpuState);
-void x86_fxrstor(const void* fpuState);
-void x86_noop_swap(void* oldFpuState, const void* newFpuState);
-void x86_fxsave_swap(void* oldFpuState, const void* newFpuState);
 addr_t x86_get_stack_frame();
 uint32 x86_count_mtrrs(void);
 void x86_set_mtrr(uint32 index, uint64 base, uint64 length, uint8 type);
@@ -489,7 +485,13 @@ void x86_context_switch(struct arch_thread* oldState,
 void x86_fnsave(void* fpuState);
 void x86_frstor(const void* fpuState);
+void x86_fxsave(void* fpuState);
+void x86_fxrstor(const void* fpuState);
+
+void x86_noop_swap(void* oldFpuState, const void* newFpuState);
 void x86_fnsave_swap(void* oldFpuState, const void* newFpuState);
+void x86_fxsave_swap(void* oldFpuState, const void* newFpuState);
 
 #endif


@@ -9,11 +9,9 @@
 #	error Must not be included directly. Include <commpage_defs.h> instead!
 #endif
 
-#define COMMPAGE_ENTRY_X86_MEMCPY	(COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 0)
-#define COMMPAGE_ENTRY_X86_MEMSET	(COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 1)
 #define COMMPAGE_ENTRY_X86_SIGNAL_HANDLER \
-	(COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 2)
+	(COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 0)
 #define COMMPAGE_ENTRY_X86_THREAD_EXIT \
-	(COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 3)
+	(COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 1)
 
 #endif	/* _SYSTEM_ARCH_x86_64_COMMPAGE_DEFS_H */


@@ -278,6 +278,14 @@ convert_kernel_args()
 }
 
+static void
+enable_sse()
+{
+	x86_write_cr4(x86_read_cr4() | CR4_OS_FXSR | CR4_OS_XMM_EXCEPTION);
+	x86_write_cr0(x86_read_cr0() & ~(CR0_FPU_EMULATION | CR0_MONITOR_FPU));
+}
+
+
 static void
 long_smp_start_kernel(void)
 {
@@ -287,6 +295,7 @@ long_smp_start_kernel(void)
 	asm("movl %%eax, %%cr0" : : "a" ((1 << 31) | (1 << 16) | (1 << 5) | 1));
 	asm("cld");
 	asm("fninit");
+	enable_sse();
 
 	// Fix our kernel stack address.
 	gKernelArgs.cpu_kstack[cpu].start
@@ -308,6 +317,8 @@ long_start_kernel()
 	if ((info.regs.edx & (1 << 29)) == 0)
 		panic("64-bit kernel requires a 64-bit CPU");
 
+	enable_sse();
+
 	preloaded_elf64_image *image = static_cast<preloaded_elf64_image *>(
 		gKernelArgs.kernel_image.Pointer());


@@ -30,6 +30,10 @@ extern "C" void x86_sysenter();
 void (*gX86SetSyscallStack)(addr_t stackTop) = NULL;
 
+extern int memcpy_end;
+extern int memset_end;
+
+
 static bool
 all_cpus_have_feature(enum x86_feature_type type, int feature)
 {
@@ -109,8 +113,20 @@ x86_initialize_syscall(void)
 	addr_t position = fill_commpage_entry(COMMPAGE_ENTRY_X86_SYSCALL,
 		syscallCode, len);
 
+	// put the optimized functions into the commpage
+	size_t memcpyLen = (addr_t)&memcpy_end - (addr_t)memcpy;
+	addr_t memcpyPosition = fill_commpage_entry(COMMPAGE_ENTRY_X86_MEMCPY,
+		(const void*)memcpy, memcpyLen);
+	size_t memsetLen = (addr_t)&memset_end - (addr_t)memset;
+	addr_t memsetPosition = fill_commpage_entry(COMMPAGE_ENTRY_X86_MEMSET,
+		(const void*)memset, memsetLen);
+
 	// add syscall to the commpage image
 	image_id image = get_commpage_image();
+	elf_add_memory_image_symbol(image, "commpage_memcpy", memcpyPosition,
+		memcpyLen, B_SYMBOL_TYPE_TEXT);
+	elf_add_memory_image_symbol(image, "commpage_memset", memsetPosition,
+		memsetLen, B_SYMBOL_TYPE_TEXT);
 	elf_add_memory_image_symbol(image, "commpage_syscall", position, len,
 		B_SYMBOL_TYPE_TEXT);
 }


@@ -19,35 +19,6 @@
 .text
 
-/* void x86_fxsave(void* fpuState); */
-FUNCTION(x86_fxsave):
-	fxsave	(%rdi)
-	ret
-FUNCTION_END(x86_fxsave)
-
-/* void x86_fxrstor(const void* fpuState); */
-FUNCTION(x86_fxrstor):
-	fxrstor	(%rdi)
-	ret
-FUNCTION_END(x86_fxrstor)
-
-/* void x86_noop_swap(void *oldFpuState, const void *newFpuState); */
-FUNCTION(x86_noop_swap):
-	nop
-	ret
-FUNCTION_END(x86_noop_swap)
-
-/* void x86_fxsave_swap(void* oldFpuState, const void* newFpuState); */
-FUNCTION(x86_fxsave_swap):
-	fxsave	(%rdi)
-	fxrstor	(%rsi)
-	ret
-FUNCTION_END(x86_fxsave_swap)
-
 /* addr_t x86_get_stack_frame(); */
 FUNCTION(x86_get_stack_frame):
 	mov	%rbp, %rax


@@ -35,11 +35,13 @@
 	push	%r13;					\
 	push	%r14;					\
 	push	%r15;					\
+	pushq	$0;					\
 	push	$iframeType;
 
 // Restore the interrupt frame.
 #define RESTORE_IFRAME()				\
-	add	$8, %rsp;				\
+	add	$16, %rsp;				\
 	pop	%r15;					\
 	pop	%r14;					\
 	pop	%r13;					\
@@ -116,6 +118,23 @@
 	call	x86_exit_user_debug_at_kernel_entry;	\
 1:
 
+#define CLEAR_FPU_STATE()				\
+	pxor	%xmm0, %xmm0;				\
+	pxor	%xmm1, %xmm1;				\
+	pxor	%xmm2, %xmm2;				\
+	pxor	%xmm3, %xmm3;				\
+	pxor	%xmm4, %xmm4;				\
+	pxor	%xmm5, %xmm5;				\
+	pxor	%xmm6, %xmm6;				\
+	pxor	%xmm7, %xmm7;				\
+	pxor	%xmm8, %xmm8;				\
+	pxor	%xmm9, %xmm9;				\
+	pxor	%xmm10, %xmm10;				\
+	pxor	%xmm11, %xmm11;				\
+	pxor	%xmm12, %xmm12;				\
+	pxor	%xmm13, %xmm13;				\
+	pxor	%xmm14, %xmm14;				\
+	pxor	%xmm15, %xmm15
+
 // The following code defines the interrupt service routines for all 256
 // interrupts. It creates a block of handlers, each 16 bytes, that the IDT
@@ -198,11 +217,18 @@ STATIC_FUNCTION(int_bottom):
 	// exception.
 	orq	$X86_EFLAGS_RESUME, IFRAME_flags(%rbp)
 
+	subq	$512, %rsp
+	andq	$~15, %rsp
+	fxsaveq	(%rsp)
+
 	// Call the interrupt handler.
-	movq	%rsp, %rdi
-	movq	IFRAME_vector(%rsp), %rax
+	movq	%rbp, %rdi
+	movq	IFRAME_vector(%rbp), %rax
 	call	*gInterruptHandlerTable(, %rax, 8)
 
+	fxrstorq	(%rsp)
+	movq	%rbp, %rsp
+
 	// Restore the saved registers.
 	RESTORE_IFRAME()
@@ -217,12 +243,16 @@ STATIC_FUNCTION(int_bottom_user):
 	// Push the rest of the interrupt frame to the stack.
 	PUSH_IFRAME_BOTTOM(IFRAME_TYPE_OTHER)
 
 	cld
 
 	// Frame pointer is the iframe.
 	movq	%rsp, %rbp
 
+	subq	$512, %rsp
+	andq	$~15, %rsp
+	fxsaveq	(%rsp)
+	movq	%rsp, IFRAME_fpu(%rbp)
+
 	// Set the RF (resume flag) in RFLAGS. This prevents an instruction
 	// breakpoint on the instruction we're returning to to trigger a debug
 	// exception.
@@ -235,8 +265,8 @@ STATIC_FUNCTION(int_bottom_user):
 	UPDATE_THREAD_USER_TIME()
 
 	// Call the interrupt handler.
-	movq	%rsp, %rdi
-	movq	IFRAME_vector(%rsp), %rax
+	movq	%rbp, %rdi
+	movq	IFRAME_vector(%rbp), %rax
 	call	*gInterruptHandlerTable(, %rax, 8)
 
 	// If there are no signals pending or we're not debugging, we can avoid
@@ -250,6 +280,9 @@ STATIC_FUNCTION(int_bottom_user):
 	UPDATE_THREAD_KERNEL_TIME()
 
+	fxrstorq	(%rsp)
+	movq	%rbp, %rsp
+
 	// Restore the saved registers.
 	RESTORE_IFRAME()
@@ -274,6 +307,9 @@ STATIC_FUNCTION(int_bottom_user):
 	movq	%rbp, %rdi
 	call	x86_init_user_debug_at_kernel_exit
 1:
+	fxrstorq	(%rsp)
+	movq	%rbp, %rsp
+
 	// Restore the saved registers.
 	RESTORE_IFRAME()
@@ -315,6 +351,7 @@ FUNCTION(x86_64_syscall_entry):
 	// Frame pointer is the iframe.
 	movq	%rsp, %rbp
+	andq	$~15, %rsp
 
 	// Preserve call number (R14 is callee-save), get thread pointer.
 	movq	%rax, %r14
@@ -367,10 +404,10 @@ FUNCTION(x86_64_syscall_entry):
 	// TODO: post-syscall tracing
 
-.Lsyscall_return:
 	// Restore the original stack pointer and return.
 	movq	%rbp, %rsp
 
+.Lsyscall_return:
 	// Clear the restarted flag.
 	testl	$THREAD_FLAGS_SYSCALL_RESTARTED, THREAD_flags(%r12)
 	jz	2f
@@ -394,7 +431,9 @@ FUNCTION(x86_64_syscall_entry):
 	// If we've just restored a signal frame, use the IRET path.
 	cmpq	$SYSCALL_RESTORE_SIGNAL_FRAME, %r14
-	je	.Liret
+	je	.Lrestore_fpu
+
+	CLEAR_FPU_STATE()
 
 	// Restore the iframe and RCX/R11 for SYSRET.
 	RESTORE_IFRAME()
@@ -458,14 +497,19 @@ FUNCTION(x86_64_syscall_entry):
 1:
 	// Install breakpoints, if defined.
 	testl	$THREAD_FLAGS_BREAKPOINTS_DEFINED, THREAD_flags(%r12)
-	jz	.Liret
+	jz	1f
 	movq	%rbp, %rdi
 	call	x86_init_user_debug_at_kernel_exit
+1:
 	// On this return path it is possible that the frame has been modified,
 	// for example to execute a signal handler. In this case it is safer to
 	// return via IRET.
+	CLEAR_FPU_STATE()
+	jmp	.Liret
+
+.Lrestore_fpu:
+	movq	IFRAME_fpu(%rbp), %rax
+	fxrstorq	(%rax)
 .Liret:
 	// Restore the saved registers.
 	RESTORE_IFRAME()
@@ -493,6 +537,7 @@ FUNCTION(x86_64_syscall_entry):
 	// Make space on the stack.
 	subq	%rcx, %rsp
+	andq	$~15, %rsp
 	movq	%rsp, %rdi
 
 	// Set a fault handler.
@@ -535,7 +580,7 @@ FUNCTION(x86_return_to_userland):
 	testl	$(THREAD_FLAGS_DEBUGGER_INSTALLED | THREAD_FLAGS_SIGNALS_PENDING \
 		| THREAD_FLAGS_DEBUG_THREAD | THREAD_FLAGS_BREAKPOINTS_DEFINED) \
 		, THREAD_flags(%r12)
-	jnz	.Lkernel_exit_work
+	jnz	.Luserland_return_work
 
 	// update the thread's kernel time and return
 	UPDATE_THREAD_KERNEL_TIME()
@@ -544,4 +589,33 @@ FUNCTION(x86_return_to_userland):
 	RESTORE_IFRAME()
 	swapgs
 	iretq
+
+.Luserland_return_work:
+	// Slow path for return to userland.
+
+	// Do we need to handle signals?
+	testl	$(THREAD_FLAGS_SIGNALS_PENDING | THREAD_FLAGS_DEBUG_THREAD) \
+		, THREAD_flags(%r12)
+	jnz	.Luserland_return_handle_signals
+	cli
+	call	thread_at_kernel_exit_no_signals
+
+.Luserland_return_work_done:
+	// Install breakpoints, if defined.
+	testl	$THREAD_FLAGS_BREAKPOINTS_DEFINED, THREAD_flags(%r12)
+	jz	1f
+	movq	%rbp, %rdi
+	call	x86_init_user_debug_at_kernel_exit
+1:
+	// Restore the saved registers.
+	RESTORE_IFRAME()
+
+	// Restore the previous GS base and return.
+	swapgs
+	iretq
+
+.Luserland_return_handle_signals:
+	// thread_at_kernel_exit requires interrupts to be enabled, it will disable
+	// them after.
+	sti
+	call	thread_at_kernel_exit
+	jmp	.Luserland_return_work_done
 FUNCTION_END(x86_return_to_userland)


@@ -134,9 +134,12 @@ arch_thread_init(kernel_args* args)
 {
 	// Save one global valid FPU state; it will be copied in the arch dependent
 	// part of each new thread.
-	asm volatile ("clts; fninit; fnclex;");
-	x86_fxsave(sInitialState.fpu_state);
+	asm volatile (
+		"clts;"		\
+		"fninit;"	\
+		"fnclex;"	\
+		"fxsave %0;"
+		: "=m" (sInitialState.fpu_state));
 	return B_OK;
 }
@@ -296,15 +299,14 @@ arch_setup_signal_frame(Thread* thread, struct sigaction* action,
 	signalFrameData->context.uc_mcontext.rip = frame->ip;
 	signalFrameData->context.uc_mcontext.rflags = frame->flags;
 
-	// Store the FPU state. There appears to be a bug in GCC where the aligned
-	// attribute on a structure is being ignored when the structure is allocated
-	// on the stack, so even if the fpu_state struct has aligned(16) it may not
-	// get aligned correctly. Instead, use the current thread's FPU save area
-	// and then memcpy() to the frame structure.
-	x86_fxsave(thread->arch_info.fpu_state);
-	memcpy((void*)&signalFrameData->context.uc_mcontext.fpu,
-		thread->arch_info.fpu_state,
-		sizeof(signalFrameData->context.uc_mcontext.fpu));
+	if (frame->fpu != nullptr) {
+		memcpy((void*)&signalFrameData->context.uc_mcontext.fpu, frame->fpu,
+			sizeof(signalFrameData->context.uc_mcontext.fpu));
+	} else {
+		memcpy((void*)&signalFrameData->context.uc_mcontext.fpu,
+			sInitialState.fpu_state,
+			sizeof(signalFrameData->context.uc_mcontext.fpu));
+	}
 
 	// Fill in signalFrameData->context.uc_stack.
 	signal_get_user_stack(frame->user_sp, &signalFrameData->context.uc_stack);
@@ -370,13 +372,12 @@ arch_restore_signal_frame(struct signal_frame_data* signalFrameData)
 	frame->flags = (frame->flags & ~(uint64)X86_EFLAGS_USER_FLAGS)
 		| (signalFrameData->context.uc_mcontext.rflags & X86_EFLAGS_USER_FLAGS);
 
-	// Same as above, alignment may not be correct. Copy to thread and restore
-	// from there.
 	Thread* thread = thread_get_current_thread();
 	memcpy(thread->arch_info.fpu_state,
 		(void*)&signalFrameData->context.uc_mcontext.fpu,
 		sizeof(thread->arch_info.fpu_state));
-	x86_fxrstor(thread->arch_info.fpu_state);
+	frame->fpu = &thread->arch_info.fpu_state;
 
 	// The syscall return code overwrites frame->ax with the return value of
 	// the syscall, need to return it here to ensure the correct value is


@@ -59,14 +59,6 @@ static const struct cpu_vendor_info vendor_info[VENDOR_NUM] = {
 	{ "NSC", { "Geode by NSC" } },
 };
 
-#define CR0_CACHE_DISABLE		(1UL << 30)
-#define CR0_NOT_WRITE_THROUGH	(1UL << 29)
-#define CR0_FPU_EMULATION		(1UL << 2)
-#define CR0_MONITOR_FPU			(1UL << 1)
-
-#define CR4_OS_FXSR				(1UL << 9)
-#define CR4_OS_XMM_EXCEPTION	(1UL << 10)
-
 #define K8_SMIONCMPHALT			(1ULL << 27)
 #define K8_C1EONCMPHALT			(1ULL << 28)
@@ -90,8 +82,10 @@ extern "C" void x86_reboot(void);
 	// from arch.S
 
 void (*gCpuIdleFunc)(void);
+#ifndef __x86_64__
 void (*gX86SwapFPUFunc)(void* oldState, const void* newState) = x86_noop_swap;
 bool gHasSSE = false;
+#endif
 
 static uint32 sCpuRendezvous;
 static uint32 sCpuRendezvous2;
@@ -105,18 +99,6 @@ static const size_t kDoubleFaultStackSize = 4096;	// size per CPU
 
 static x86_cpu_module_info* sCpuModule;
 
-extern "C" void memcpy_generic(void* dest, const void* source, size_t count);
-extern int memcpy_generic_end;
-extern "C" void memset_generic(void* dest, int value, size_t count);
-extern int memset_generic_end;
-
-x86_optimized_functions gOptimizedFunctions = {
-	memcpy_generic,
-	&memcpy_generic_end,
-	memset_generic,
-	&memset_generic_end
-};
-
 /* CPU topology information */
 static uint32 (*sGetCPUTopologyID)(int currentCPU);
 static uint32 sHierarchyMask[CPU_TOPOLOGY_LEVELS];
@@ -338,13 +320,14 @@ x86_init_fpu(void)
 #endif
 
 	dprintf("%s: CPU has SSE... enabling FXSR and XMM.\n", __func__);
+#ifndef __x86_64__
 	// enable OS support for SSE
 	x86_write_cr4(x86_read_cr4() | CR4_OS_FXSR | CR4_OS_XMM_EXCEPTION);
 	x86_write_cr0(x86_read_cr0() & ~(CR0_FPU_EMULATION | CR0_MONITOR_FPU));
 
 	gX86SwapFPUFunc = x86_fxsave_swap;
 	gHasSSE = true;
+#endif
 }
@@ -1163,33 +1146,6 @@ arch_cpu_init_post_modules(kernel_args* args)
 		call_all_cpus(&init_mtrrs, NULL);
 	}
 
-	// get optimized functions from the CPU module
-	if (sCpuModule != NULL && sCpuModule->get_optimized_functions != NULL) {
-		x86_optimized_functions functions;
-		memset(&functions, 0, sizeof(functions));
-
-		sCpuModule->get_optimized_functions(&functions);
-
-		if (functions.memcpy != NULL) {
-			gOptimizedFunctions.memcpy = functions.memcpy;
-			gOptimizedFunctions.memcpy_end = functions.memcpy_end;
-		}
-
-		if (functions.memset != NULL) {
-			gOptimizedFunctions.memset = functions.memset;
-			gOptimizedFunctions.memset_end = functions.memset_end;
-		}
-	}
-
-	// put the optimized functions into the commpage
-	size_t memcpyLen = (addr_t)gOptimizedFunctions.memcpy_end
-		- (addr_t)gOptimizedFunctions.memcpy;
-	addr_t memcpyPosition = fill_commpage_entry(COMMPAGE_ENTRY_X86_MEMCPY,
-		(const void*)gOptimizedFunctions.memcpy, memcpyLen);
-	size_t memsetLen = (addr_t)gOptimizedFunctions.memset_end
-		- (addr_t)gOptimizedFunctions.memset;
-	addr_t memsetPosition = fill_commpage_entry(COMMPAGE_ENTRY_X86_MEMSET,
-		(const void*)gOptimizedFunctions.memset, memsetLen);
-
 	size_t threadExitLen = (addr_t)x86_end_userspace_thread_exit
 		- (addr_t)x86_userspace_thread_exit;
 	addr_t threadExitPosition = fill_commpage_entry(
@@ -1198,10 +1154,7 @@ arch_cpu_init_post_modules(kernel_args* args)
 	// add the functions to the commpage image
 	image_id image = get_commpage_image();
 
-	elf_add_memory_image_symbol(image, "commpage_memcpy", memcpyPosition,
-		memcpyLen, B_SYMBOL_TYPE_TEXT);
-	elf_add_memory_image_symbol(image, "commpage_memset", memsetPosition,
-		memsetLen, B_SYMBOL_TYPE_TEXT);
 	elf_add_memory_image_symbol(image, "commpage_thread_exit",
 		threadExitPosition, threadExitLen, B_SYMBOL_TYPE_TEXT);


@@ -31,7 +31,9 @@
 extern "C" void x86_return_to_userland(iframe* frame);
 
 // from arch_cpu.cpp
+#ifndef __x86_64__
 extern void (*gX86SwapFPUFunc)(void *oldState, const void *newState);
+#endif
 
 
 static struct iframe*
@@ -245,7 +247,9 @@ arch_thread_context_switch(Thread* from, Thread* to)
 		activePagingStructures->RemoveReference();
 	}
 
+#ifndef __x86_64__
 	gX86SwapFPUFunc(from->arch_info.fpu_state, to->arch_info.fpu_state);
+#endif
 	x86_context_switch(&from->arch_info, &to->arch_info);
 }


@@ -33,7 +33,9 @@
 // TODO: Make those real error codes.
 
+#ifndef __x86_64__
 extern bool gHasSSE;
+#endif
 
 // The software breakpoint instruction (int3).
 const uint8 kX86SoftwareBreakpoint[1] = { 0xcc };
@@ -688,6 +690,12 @@ arch_set_debug_cpu_state(const debug_cpu_state* cpuState)
 	if (iframe* frame = x86_get_user_iframe()) {
 		// For the floating point state to be correct the calling function must
 		// not use these registers (not even indirectly).
+#ifdef __x86_64__
+		Thread* thread = thread_get_current_thread();
+		memcpy(thread->arch_info.fpu_state, &cpuState->extended_registers,
+			sizeof(cpuState->extended_registers));
+		frame->fpu = &thread->arch_info.fpu_state;
+#else
 		if (gHasSSE) {
 			// Since fxrstor requires 16-byte alignment and this isn't
 			// guaranteed passed buffer, we use our thread's fpu_state field as
@@ -698,12 +706,11 @@ arch_set_debug_cpu_state(const debug_cpu_state* cpuState)
 			memcpy(thread->arch_info.fpu_state, &cpuState->extended_registers,
 				sizeof(cpuState->extended_registers));
 			x86_fxrstor(thread->arch_info.fpu_state);
-#ifndef __x86_64__
 		} else {
 			// TODO: Implement! We need to convert the format first.
 //			x86_frstor(&cpuState->extended_registers);
-#endif
 		}
+#endif
 		set_iframe_registers(frame, cpuState);
 	}
 }
@@ -715,6 +722,15 @@ arch_get_debug_cpu_state(debug_cpu_state* cpuState)
 	if (iframe* frame = x86_get_user_iframe()) {
 		// For the floating point state to be correct the calling function must
 		// not use these registers (not even indirectly).
+#ifdef __x86_64__
+		if (frame->fpu != nullptr) {
+			memcpy(&cpuState->extended_registers, frame->fpu,
+				sizeof(cpuState->extended_registers));
+		} else {
+			memset(&cpuState->extended_registers, 0,
+				sizeof(cpuState->extended_registers));
+		}
+#else
 		if (gHasSSE) {
 			// Since fxsave requires 16-byte alignment and this isn't guaranteed
 			// passed buffer, we use our thread's fpu_state field as temporary
@@ -725,15 +741,14 @@ arch_get_debug_cpu_state(debug_cpu_state* cpuState)
 			// unlike fnsave, fxsave doesn't reinit the FPU state
 			memcpy(&cpuState->extended_registers, thread->arch_info.fpu_state,
 				sizeof(cpuState->extended_registers));
-#ifndef __x86_64__
 		} else {
 			x86_fnsave(&cpuState->extended_registers);
 			x86_frstor(&cpuState->extended_registers);
 				// fnsave reinits the FPU state after saving, so we need to
 				// load it again
 			// TODO: Convert to fxsave format!
-#endif
 		}
+#endif
 		get_iframe_registers(frame, cpuState);
 	}
 }


@@ -70,6 +70,7 @@ dummy()
 	DEFINE_OFFSET_MACRO(IFRAME, iframe, r8);
 	DEFINE_OFFSET_MACRO(IFRAME, iframe, r9);
 	DEFINE_OFFSET_MACRO(IFRAME, iframe, r10);
+	DEFINE_OFFSET_MACRO(IFRAME, iframe, fpu);
 #else
 	DEFINE_OFFSET_MACRO(IFRAME, iframe, orig_eax);
 #endif
@@ -79,12 +80,6 @@ dummy()
 	DEFINE_OFFSET_MACRO(SYSCALL_INFO, syscall_info, function);
 	DEFINE_OFFSET_MACRO(SYSCALL_INFO, syscall_info, parameter_size);
 
-	// struct x86_optimized_functions
-	DEFINE_OFFSET_MACRO(X86_OPTIMIZED_FUNCTIONS, x86_optimized_functions,
-		memcpy);
-	DEFINE_OFFSET_MACRO(X86_OPTIMIZED_FUNCTIONS, x86_optimized_functions,
-		memset);
-
 	// struct signal_frame_data
 	DEFINE_SIZEOF_MACRO(SIGNAL_FRAME_DATA, signal_frame_data);
 	DEFINE_OFFSET_MACRO(SIGNAL_FRAME_DATA, signal_frame_data, info);


@@ -6,22 +6,12 @@
  * Distributed under the terms of the NewOS License.
  */
 
-#if !_BOOT_MODE
-#	include "asm_offsets.h"
-#endif
-
 #include <asm_defs.h>
 
-// We don't need the indirection in the boot loader.
-#if _BOOT_MODE
-#	define memcpy_generic memcpy
-#	define memset_generic memset
-#endif
-
 .align 4
-FUNCTION(memcpy_generic):
+FUNCTION(memcpy):
 	pushl	%esi
 	pushl	%edi
 	movl	12(%esp),%edi	/* dest */
@@ -45,13 +35,13 @@ FUNCTION(memcpy_generic):
 	popl	%edi
 	popl	%esi
 	ret
-FUNCTION_END(memcpy_generic)
-SYMBOL(memcpy_generic_end):
+FUNCTION_END(memcpy)
+SYMBOL(memcpy_end):
 
 /* void *memset(void *dest, int value, size_t length); */
 .align 4
-FUNCTION(memset_generic):
+FUNCTION(memset):
 	push	%ebp
 	mov	%esp, %ebp
@@ -111,19 +101,6 @@ FUNCTION(memset_generic):
 	mov	%ebp, %esp
 	pop	%ebp
 	ret
-FUNCTION_END(memset_generic)
-SYMBOL(memset_generic_end):
-
-#if !_BOOT_MODE
-.align 4
-FUNCTION(memcpy):
-	jmp	*(gOptimizedFunctions + X86_OPTIMIZED_FUNCTIONS_memcpy)
-FUNCTION_END(memcpy)
-
-FUNCTION(memset):
-	jmp	*(gOptimizedFunctions + X86_OPTIMIZED_FUNCTIONS_memset)
 FUNCTION_END(memset)
 SYMBOL(memset_end):
-#endif	// !_BOOT_MODE


@@ -21,6 +21,7 @@ KernelMergeObject kernel_os_arch_$(TARGET_ARCH).o :
 	;
 
 SEARCH_SOURCE += [ FDirName $(posixSources) arch $(TARGET_ARCH) ] ;
+SEARCH_SOURCE += [ FDirName $(posixSources) string arch $(TARGET_ARCH) ] ;
 
 KernelMergeObject kernel_lib_posix_arch_$(TARGET_ARCH).o :
 	siglongjmp.S
@@ -28,12 +29,8 @@ KernelMergeObject kernel_lib_posix_arch_$(TARGET_ARCH).o :
 	kernel_longjmp_return.c
 	kernel_setjmp_save_sigs.c
 
-	arch_string.S
+	arch_string.cpp
 
 	: $(TARGET_KERNEL_PIC_CCFLAGS)
 ;
-
-# Explicitly tell the build system that arch_string.S includes the generated
-# asm_offsets.h.
-Includes [ FGristFiles arch_string.S ]
-	: <src!system!kernel!arch!x86>asm_offsets.h ;


@@ -1,96 +0,0 @@
/*
 * Copyright 2012, Alex Smith, alex@alex-smith.me.uk.
 * Distributed under the terms of the MIT License.
 */

#include <asm_defs.h>

#include "asm_offsets.h"

.align 8
FUNCTION(memcpy_generic):
	push	%rbp
	movq	%rsp, %rbp

	// Preserve original destination address for return value.
	movq	%rdi, %rax

	// size -> %rcx
	movq	%rdx, %rcx

	// For small copies, always do it bytewise, the additional overhead is
	// not worth it.
	cmp	$24, %rcx
	jl	.Lmemcpy_generic_byte_copy

	// Do both source and dest have the same alignment?
	movq	%rsi, %r8
	xorq	%rdi, %r8
	test	$7, %r8
	jnz	.Lmemcpy_generic_byte_copy

	// Align up to an 8-byte boundary.
	movq	%rdi, %r8
	andq	$7, %r8
	jz	.Lmemcpy_generic_qword_copy
	movq	$8, %rcx
	subq	%r8, %rcx
	subq	%rcx, %rdx	// Subtract from the overall count.
	rep
	movsb

	// Get back the original count value.
	movq	%rdx, %rcx

.Lmemcpy_generic_qword_copy:
	// Move by quadwords.
	shrq	$3, %rcx
	rep
	movsq

	// Get the remaining count.
	movq	%rdx, %rcx
	andq	$7, %rcx

.Lmemcpy_generic_byte_copy:
	// Move any remaining data by bytes.
	rep
	movsb

	pop	%rbp
	ret
FUNCTION_END(memcpy_generic)
SYMBOL(memcpy_generic_end):

.align 8
FUNCTION(memset_generic):
	push	%rbp
	movq	%rsp, %rbp

	// Preserve original destination address for return value.
	movq	%rdi, %r8

	// size -> %rcx, value -> %al
	movq	%rdx, %rcx
	movl	%esi, %eax

	// Move by bytes.
	rep
	stosb

	movq	%r8, %rax
	pop	%rbp
	ret
FUNCTION_END(memset_generic)
SYMBOL(memset_generic_end):

FUNCTION(memcpy):
	jmp	*(gOptimizedFunctions + X86_OPTIMIZED_FUNCTIONS_memcpy)
FUNCTION_END(memcpy)

FUNCTION(memset):
	jmp	*(gOptimizedFunctions + X86_OPTIMIZED_FUNCTIONS_memset)
FUNCTION_END(memset)


@@ -1,5 +1,7 @@
 SubDir HAIKU_TOP src system libroot posix string arch x86_64 ;
 
+SubDirC++Flags -std=gnu++11 ;
+
 local architectureObject ;
 for architectureObject in [ MultiArchSubDirSetup x86_64 ] {
 	on $(architectureObject) {
@@ -8,7 +10,7 @@ for architectureObject in [ MultiArchSubDirSetup x86_64 ] {
 		UsePrivateSystemHeaders ;
 
 		MergeObject <$(architecture)>posix_string_arch_$(TARGET_ARCH).o :
-			arch_string.S
+			arch_string.cpp
 			;
 	}
 }


@@ -1,22 +0,0 @@
/*
 * Copyright 2008, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Distributed under the terms of the MIT License.
 */

#include <asm_defs.h>

#include <commpage_defs.h>

FUNCTION(memcpy):
	movq	__gCommPageAddress@GOTPCREL(%rip), %rax
	movq	(%rax), %rax
	addq	8 * COMMPAGE_ENTRY_X86_MEMCPY(%rax), %rax
	jmp	*%rax
FUNCTION_END(memcpy)

FUNCTION(memset):
	movq	__gCommPageAddress@GOTPCREL(%rip), %rax
	movq	(%rax), %rax
	addq	8 * COMMPAGE_ENTRY_X86_MEMSET(%rax), %rax
	jmp	*%rax
FUNCTION_END(memset)


@@ -0,0 +1,216 @@
/*
 * Copyright 2014, Paweł Dziepak, pdziepak@quarnos.org.
 * Distributed under the terms of the MIT License.
 */

#include <array>
#include <cstddef>
#include <cstdint>

#include <x86intrin.h>

namespace {

template<template<size_t N> class Generator, unsigned N, unsigned ...Index>
struct GenerateTable : GenerateTable<Generator, N - 1, N - 1, Index...> {
};

template<template<size_t N> class Generator, unsigned ...Index>
struct GenerateTable<Generator, 0, Index...>
	: std::array<decltype(Generator<0>::sValue), sizeof...(Index)> {
	constexpr GenerateTable()
		:
		std::array<decltype(Generator<0>::sValue), sizeof...(Index)> {
			{ Generator<Index>::sValue... }
		}
	{
	}
};
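
// GenerateTable<Generator, N> accumulates the index pack 0..N-1 and, at the
// base case, becomes a std::array of N values { Generator<0>::sValue, ...,
// Generator<N - 1>::sValue }, built entirely at compile time. For instance,
// GenerateTable<SmallGenerator, 8> below is an array of eight function
// pointers, copy_small<0> through copy_small<7>, indexed by byte count.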

static inline void memcpy_repmovs(uint8_t* destination, const uint8_t* source,
	size_t length)
{
	__asm__ __volatile__("rep movsb"
		: "+D" (destination), "+S" (source), "+c" (length)
		:
		: "memory");
}


template<size_t N>
inline void copy_small(uint8_t* destination, const uint8_t* source)
{
	struct data {
		uint8_t x[N];
	};
	*reinterpret_cast<data*>(destination)
		= *reinterpret_cast<const data*>(source);
}
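
// Copying through a struct of N bytes compiles to a fixed-size inline move,
// yielding one specialized copier per small length for the table below.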

template<size_t N>
struct SmallGenerator {
	constexpr static void (*sValue)(uint8_t*, const uint8_t*) = copy_small<N>;
};
constexpr static GenerateTable<SmallGenerator, 8> table_small;


static inline void memcpy_small(uint8_t* destination, const uint8_t* source,
	size_t length)
{
	if (length < 8) {
		table_small[length](destination, source);
	} else {
		auto to = reinterpret_cast<uint64_t*>(destination);
		auto from = reinterpret_cast<const uint64_t*>(source);
		*to = *from;
		to = reinterpret_cast<uint64_t*>(destination + length - 8);
		from = reinterpret_cast<const uint64_t*>(source + length - 8);
		*to = *from;
	}
}
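
// memcpy_small is only ever called with length <= 16: lengths below 8 go
// through the jump table, while for 8..16 the two 8-byte moves above overlap
// in the middle, covering the whole range with two loads and two stores.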

template<size_t N>
inline void copy_sse(__m128i* destination, const __m128i* source)
{
	auto temp = _mm_loadu_si128(source);
	_mm_storeu_si128(destination, temp);
	copy_sse<N - 1>(destination + 1, source + 1);
}


template<>
inline void copy_sse<0>(__m128i* destination, const __m128i* source)
{
}


template<size_t N>
struct SSEGenerator {
	constexpr static void (*sValue)(__m128i*, const __m128i*) = copy_sse<N>;
};
constexpr static GenerateTable<SSEGenerator, 4> table_sse;


static inline void memcpy_sse(uint8_t* destination, const uint8_t* source,
	size_t length)
{
	auto to = reinterpret_cast<__m128i*>(destination);
	auto from = reinterpret_cast<const __m128i*>(source);
	auto toEnd = reinterpret_cast<__m128i*>(destination + length - 16);
	auto fromEnd = reinterpret_cast<const __m128i*>(source + length - 16);
	while (length >= 64) {
		copy_sse<4>(to, from);
		to += 4;
		from += 4;
		length -= 64;
	}
	if (length >= 16) {
		table_sse[length / 16](to, from);
		length %= 16;
	}
	if (length) {
		copy_sse<1>(toEnd, fromEnd);
	}
}
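
// memcpy_sse handles 17..2047 bytes: 64-byte unrolled iterations first, then
// one call through table_sse for the remaining whole 16-byte blocks, and
// finally a single unaligned 16-byte copy anchored at the end of the buffer,
// overlapping already-copied bytes instead of looping over the tail.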

}


extern "C" void* memcpy(void* destination, const void* source, size_t length)
{
	auto to = static_cast<uint8_t*>(destination);
	auto from = static_cast<const uint8_t*>(source);
	if (length <= 16) {
		memcpy_small(to, from, length);
		return destination;
	}
	if (length < 2048) {
		memcpy_sse(to, from, length);
		return destination;
	}
	memcpy_repmovs(to, from, length);
	return destination;
}
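
// Dispatch, above: tiny copies (<= 16 bytes) never touch the vector unit,
// mid-size copies take the SSE path, and from 2048 bytes up "rep movsb" is
// used, which recent CPUs execute with optimized microcode and which leaves
// the SSE registers untouched.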

static inline void
memset_repstos(uint8_t* destination, uint8_t value, size_t length)
{
	__asm__ __volatile__("rep stosb"
		: "+D" (destination), "+c" (length)
		: "a" (value)
		: "memory");
}


static inline void
memset_sse(uint8_t* destination, uint8_t value, size_t length)
{
	__m128i packed = _mm_set1_epi8(value);
	auto end = reinterpret_cast<__m128i*>(destination + length - 16);
	auto diff = reinterpret_cast<uintptr_t>(destination) % 16;
	if (diff) {
		diff = 16 - diff;
		length -= diff;
		_mm_storeu_si128(reinterpret_cast<__m128i*>(destination), packed);
	}
	auto ptr = reinterpret_cast<__m128i*>(destination + diff);
	while (length >= 64) {
		_mm_store_si128(ptr++, packed);
		_mm_store_si128(ptr++, packed);
		_mm_store_si128(ptr++, packed);
		_mm_store_si128(ptr++, packed);
		length -= 64;
	}
	while (length >= 16) {
		_mm_store_si128(ptr++, packed);
		length -= 16;
	}
	_mm_storeu_si128(end, packed);
}
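
// memset_sse: one unaligned store covers a misaligned head, the body then
// runs aligned 16-byte stores (4x unrolled), and a final unaligned store
// anchored at the last 16 bytes overlaps the body instead of handling the
// tail byte by byte. Callers guarantee length >= 32, so the head and tail
// stores never reach outside the buffer.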

static inline void
memset_small(uint8_t* destination, uint8_t value, size_t length)
{
	if (length >= 8) {
		auto packed = value * 0x101010101010101ul;
		auto ptr = reinterpret_cast<uint64_t*>(destination);
		auto end = reinterpret_cast<uint64_t*>(destination + length - 8);
		while (length >= 8) {
			*ptr++ = packed;
			length -= 8;
		}
		*end = packed;
	} else {
		while (length--) {
			*destination++ = value;
		}
	}
}
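
// Multiplying the byte value by 0x0101010101010101 broadcasts it into all
// eight bytes of a 64-bit word; the trailing store through "end" overlaps
// the last full word to cover any remainder, mirroring memcpy_small.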
extern "C" void*
memset(void* ptr, int chr, size_t length)
{
auto value = static_cast<unsigned char>(chr);
auto destination = static_cast<uint8_t*>(ptr);
if (length < 32) {
memset_small(destination, value, length);
return ptr;
}
if (length < 2048) {
memset_sse(destination, value, length);
return ptr;
}
memset_repstos(destination, value, length);
return ptr;
}
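
A quick way to gain confidence in a memcpy()/memset() replacement like the
one above is to diff it against a byte-by-byte reference over all small
lengths and misalignments. A minimal standalone harness along those lines
(hypothetical test, not part of the commit; link it against the
implementation above):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
	static uint8_t source[4096], actual[4096], expected[4096];
	for (size_t i = 0; i < sizeof(source); i++)
		source[i] = static_cast<uint8_t>(i * 131 + 7);

	// Exercise every length up to 512 and every destination misalignment
	// up to 64, so the small and SSE paths and the overlap tricks are hit.
	for (size_t length = 0; length <= 512; length++) {
		for (size_t offset = 0; offset < 64; offset++) {
			memset(actual, 0, sizeof(actual));
			memset(expected, 0, sizeof(expected));

			memcpy(actual + offset, source, length);
			for (size_t i = 0; i < length; i++)
				expected[offset + i] = source[i];
			assert(memcmp(actual, expected, sizeof(actual)) == 0);

			memset(actual + offset, 0xa5, length);
			for (size_t i = 0; i < length; i++)
				expected[offset + i] = 0xa5;
			assert(memcmp(actual, expected, sizeof(actual)) == 0);
		}
	}
	printf("all small memcpy/memset cases match\n");
	return 0;
}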