Merge branch 'memcpy-v3'
This is a major rework of how Haiku implements memset() and memcpy() on x64. These functions are now removed from the commpage and reimplemented in C++ using sse2 where it proved to be beneficial. That required some serious changes in handling fpu state. Now, full fpu state is saved at each interrupt, but not on task switch. Some numbers: results of building targets: kernel libroot.so runtime_loader and HaikuDepot on Intel i7 4770 with 16GB of memory. real user sys before: 1m54.367 7m40.617 0m58.641 1m33.922 8m12.362 1m0.852 1m32.922 8m10.509 1m1.006 1m31.947 8m12.596 1m1.353 after: 1m50.959 7m43.118 0m58.923 1m30.644 8m6.385 1m0.584 1m31.549 8m7.976 0m59.792 1m31.546 8m6.733 1m0.242
This commit is contained in:
commit
e81b792e8f
@ -28,6 +28,10 @@ x86_write_msr(uint32_t msr, uint64_t value)
|
||||
static inline void
|
||||
x86_context_switch(arch_thread* oldState, arch_thread* newState)
|
||||
{
|
||||
uint16_t fpuControl;
|
||||
asm volatile("fnstcw %0" : "=m" (fpuControl));
|
||||
uint32_t sseControl;
|
||||
asm volatile("stmxcsr %0" : "=m" (sseControl));
|
||||
asm volatile(
|
||||
"pushq %%rbp;"
|
||||
"movq $1f, %c[rip](%0);"
|
||||
@ -41,7 +45,11 @@ x86_context_switch(arch_thread* oldState, arch_thread* newState)
|
||||
[rsp] "i" (offsetof(arch_thread, current_stack)),
|
||||
[rip] "i" (offsetof(arch_thread, instruction_pointer))
|
||||
: "rbx", "rcx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13",
|
||||
"r14", "r15", "memory");
|
||||
"r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
|
||||
"xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
|
||||
"xmm14", "xmm15", "memory");
|
||||
asm volatile("ldmxcsr %0" : : "m" (sseControl));
|
||||
asm volatile("fldcw %0" : : "m" (fpuControl));
|
||||
}
|
||||
|
||||
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
struct iframe {
|
||||
uint64 type;
|
||||
void* fpu;
|
||||
uint64 r15;
|
||||
uint64 r14;
|
||||
uint64 r13;
|
||||
|
@ -243,6 +243,14 @@
|
||||
| X86_EFLAGS_AUXILIARY_CARRY | X86_EFLAGS_ZERO | X86_EFLAGS_SIGN \
|
||||
| X86_EFLAGS_DIRECTION | X86_EFLAGS_OVERFLOW)
|
||||
|
||||
#define CR0_CACHE_DISABLE (1UL << 30)
|
||||
#define CR0_NOT_WRITE_THROUGH (1UL << 29)
|
||||
#define CR0_FPU_EMULATION (1UL << 2)
|
||||
#define CR0_MONITOR_FPU (1UL << 1)
|
||||
|
||||
#define CR4_OS_FXSR (1UL << 9)
|
||||
#define CR4_OS_XMM_EXCEPTION (1UL << 10)
|
||||
|
||||
|
||||
// iframe types
|
||||
#define IFRAME_TYPE_SYSCALL 0x1
|
||||
@ -262,13 +270,6 @@ typedef struct x86_mtrr_info {
|
||||
uint8 type;
|
||||
} x86_mtrr_info;
|
||||
|
||||
typedef struct x86_optimized_functions {
|
||||
void (*memcpy)(void* dest, const void* source, size_t count);
|
||||
void* memcpy_end;
|
||||
void (*memset)(void* dest, int value, size_t count);
|
||||
void* memset_end;
|
||||
} x86_optimized_functions;
|
||||
|
||||
typedef struct x86_cpu_module_info {
|
||||
module_info info;
|
||||
uint32 (*count_mtrrs)(void);
|
||||
@ -280,8 +281,6 @@ typedef struct x86_cpu_module_info {
|
||||
uint8* _type);
|
||||
void (*set_mtrrs)(uint8 defaultType, const x86_mtrr_info* infos,
|
||||
uint32 count);
|
||||
|
||||
void (*get_optimized_functions)(x86_optimized_functions* functions);
|
||||
} x86_cpu_module_info;
|
||||
|
||||
// features
|
||||
@ -455,10 +454,7 @@ void __x86_setup_system_time(uint32 conversionFactor,
|
||||
|
||||
void x86_userspace_thread_exit(void);
|
||||
void x86_end_userspace_thread_exit(void);
|
||||
void x86_fxsave(void* fpuState);
|
||||
void x86_fxrstor(const void* fpuState);
|
||||
void x86_noop_swap(void* oldFpuState, const void* newFpuState);
|
||||
void x86_fxsave_swap(void* oldFpuState, const void* newFpuState);
|
||||
|
||||
addr_t x86_get_stack_frame();
|
||||
uint32 x86_count_mtrrs(void);
|
||||
void x86_set_mtrr(uint32 index, uint64 base, uint64 length, uint8 type);
|
||||
@ -489,7 +485,13 @@ void x86_context_switch(struct arch_thread* oldState,
|
||||
|
||||
void x86_fnsave(void* fpuState);
|
||||
void x86_frstor(const void* fpuState);
|
||||
|
||||
void x86_fxsave(void* fpuState);
|
||||
void x86_fxrstor(const void* fpuState);
|
||||
|
||||
void x86_noop_swap(void* oldFpuState, const void* newFpuState);
|
||||
void x86_fnsave_swap(void* oldFpuState, const void* newFpuState);
|
||||
void x86_fxsave_swap(void* oldFpuState, const void* newFpuState);
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -9,11 +9,9 @@
|
||||
# error Must not be included directly. Include <commpage_defs.h> instead!
|
||||
#endif
|
||||
|
||||
#define COMMPAGE_ENTRY_X86_MEMCPY (COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 0)
|
||||
#define COMMPAGE_ENTRY_X86_MEMSET (COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 1)
|
||||
#define COMMPAGE_ENTRY_X86_SIGNAL_HANDLER \
|
||||
(COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 2)
|
||||
(COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 0)
|
||||
#define COMMPAGE_ENTRY_X86_THREAD_EXIT \
|
||||
(COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 3)
|
||||
(COMMPAGE_ENTRY_FIRST_ARCH_SPECIFIC + 1)
|
||||
|
||||
#endif /* _SYSTEM_ARCH_x86_64_COMMPAGE_DEFS_H */
|
||||
|
@ -278,6 +278,14 @@ convert_kernel_args()
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
enable_sse()
|
||||
{
|
||||
x86_write_cr4(x86_read_cr4() | CR4_OS_FXSR | CR4_OS_XMM_EXCEPTION);
|
||||
x86_write_cr0(x86_read_cr0() & ~(CR0_FPU_EMULATION | CR0_MONITOR_FPU));
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
long_smp_start_kernel(void)
|
||||
{
|
||||
@ -287,6 +295,7 @@ long_smp_start_kernel(void)
|
||||
asm("movl %%eax, %%cr0" : : "a" ((1 << 31) | (1 << 16) | (1 << 5) | 1));
|
||||
asm("cld");
|
||||
asm("fninit");
|
||||
enable_sse();
|
||||
|
||||
// Fix our kernel stack address.
|
||||
gKernelArgs.cpu_kstack[cpu].start
|
||||
@ -308,6 +317,8 @@ long_start_kernel()
|
||||
if ((info.regs.edx & (1 << 29)) == 0)
|
||||
panic("64-bit kernel requires a 64-bit CPU");
|
||||
|
||||
enable_sse();
|
||||
|
||||
preloaded_elf64_image *image = static_cast<preloaded_elf64_image *>(
|
||||
gKernelArgs.kernel_image.Pointer());
|
||||
|
||||
|
@ -30,6 +30,10 @@ extern "C" void x86_sysenter();
|
||||
void (*gX86SetSyscallStack)(addr_t stackTop) = NULL;
|
||||
|
||||
|
||||
extern int memcpy_end;
|
||||
extern int memset_end;
|
||||
|
||||
|
||||
static bool
|
||||
all_cpus_have_feature(enum x86_feature_type type, int feature)
|
||||
{
|
||||
@ -109,8 +113,20 @@ x86_initialize_syscall(void)
|
||||
addr_t position = fill_commpage_entry(COMMPAGE_ENTRY_X86_SYSCALL,
|
||||
syscallCode, len);
|
||||
|
||||
// put the optimized functions into the commpage
|
||||
size_t memcpyLen = (addr_t)&memcpy_end - (addr_t)memcpy;
|
||||
addr_t memcpyPosition = fill_commpage_entry(COMMPAGE_ENTRY_X86_MEMCPY,
|
||||
(const void*)memcpy, memcpyLen);
|
||||
size_t memsetLen = (addr_t)&memset_end - (addr_t)memset;
|
||||
addr_t memsetPosition = fill_commpage_entry(COMMPAGE_ENTRY_X86_MEMSET,
|
||||
(const void*)memset, memsetLen);
|
||||
|
||||
// add syscall to the commpage image
|
||||
image_id image = get_commpage_image();
|
||||
elf_add_memory_image_symbol(image, "commpage_memcpy", memcpyPosition,
|
||||
memcpyLen, B_SYMBOL_TYPE_TEXT);
|
||||
elf_add_memory_image_symbol(image, "commpage_memset", memsetPosition,
|
||||
memsetLen, B_SYMBOL_TYPE_TEXT);
|
||||
elf_add_memory_image_symbol(image, "commpage_syscall", position, len,
|
||||
B_SYMBOL_TYPE_TEXT);
|
||||
}
|
||||
|
@ -19,35 +19,6 @@
|
||||
.text
|
||||
|
||||
|
||||
/* void x86_fxsave(void* fpuState); */
|
||||
FUNCTION(x86_fxsave):
|
||||
fxsave (%rdi)
|
||||
ret
|
||||
FUNCTION_END(x86_fxsave)
|
||||
|
||||
|
||||
/* void x86_fxrstor(const void* fpuState); */
|
||||
FUNCTION(x86_fxrstor):
|
||||
fxrstor (%rdi)
|
||||
ret
|
||||
FUNCTION_END(x86_fxrstor)
|
||||
|
||||
|
||||
/* void x86_noop_swap(void *oldFpuState, const void *newFpuState); */
|
||||
FUNCTION(x86_noop_swap):
|
||||
nop
|
||||
ret
|
||||
FUNCTION_END(x86_noop_swap)
|
||||
|
||||
|
||||
/* void x86_fxsave_swap(void* oldFpuState, const void* newFpuState); */
|
||||
FUNCTION(x86_fxsave_swap):
|
||||
fxsave (%rdi)
|
||||
fxrstor (%rsi)
|
||||
ret
|
||||
FUNCTION_END(x86_fxsave_swap)
|
||||
|
||||
|
||||
/* addr_t x86_get_stack_frame(); */
|
||||
FUNCTION(x86_get_stack_frame):
|
||||
mov %rbp, %rax
|
||||
|
@ -35,11 +35,13 @@
|
||||
push %r13; \
|
||||
push %r14; \
|
||||
push %r15; \
|
||||
pushq $0; \
|
||||
push $iframeType;
|
||||
|
||||
|
||||
// Restore the interrupt frame.
|
||||
#define RESTORE_IFRAME() \
|
||||
add $8, %rsp; \
|
||||
add $16, %rsp; \
|
||||
pop %r15; \
|
||||
pop %r14; \
|
||||
pop %r13; \
|
||||
@ -116,6 +118,23 @@
|
||||
call x86_exit_user_debug_at_kernel_entry; \
|
||||
1:
|
||||
|
||||
#define CLEAR_FPU_STATE() \
|
||||
pxor %xmm0, %xmm0; \
|
||||
pxor %xmm1, %xmm1; \
|
||||
pxor %xmm2, %xmm2; \
|
||||
pxor %xmm3, %xmm3; \
|
||||
pxor %xmm4, %xmm4; \
|
||||
pxor %xmm5, %xmm5; \
|
||||
pxor %xmm6, %xmm6; \
|
||||
pxor %xmm7, %xmm7; \
|
||||
pxor %xmm8, %xmm8; \
|
||||
pxor %xmm9, %xmm9; \
|
||||
pxor %xmm10, %xmm10; \
|
||||
pxor %xmm11, %xmm11; \
|
||||
pxor %xmm12, %xmm12; \
|
||||
pxor %xmm13, %xmm13; \
|
||||
pxor %xmm14, %xmm14; \
|
||||
pxor %xmm15, %xmm15
|
||||
|
||||
// The following code defines the interrupt service routines for all 256
|
||||
// interrupts. It creates a block of handlers, each 16 bytes, that the IDT
|
||||
@ -198,11 +217,18 @@ STATIC_FUNCTION(int_bottom):
|
||||
// exception.
|
||||
orq $X86_EFLAGS_RESUME, IFRAME_flags(%rbp)
|
||||
|
||||
subq $512, %rsp
|
||||
andq $~15, %rsp
|
||||
fxsaveq (%rsp)
|
||||
|
||||
// Call the interrupt handler.
|
||||
movq %rsp, %rdi
|
||||
movq IFRAME_vector(%rsp), %rax
|
||||
movq %rbp, %rdi
|
||||
movq IFRAME_vector(%rbp), %rax
|
||||
call *gInterruptHandlerTable(, %rax, 8)
|
||||
|
||||
fxrstorq (%rsp)
|
||||
movq %rbp, %rsp
|
||||
|
||||
// Restore the saved registers.
|
||||
RESTORE_IFRAME()
|
||||
|
||||
@ -217,12 +243,16 @@ STATIC_FUNCTION(int_bottom_user):
|
||||
|
||||
// Push the rest of the interrupt frame to the stack.
|
||||
PUSH_IFRAME_BOTTOM(IFRAME_TYPE_OTHER)
|
||||
|
||||
cld
|
||||
|
||||
// Frame pointer is the iframe.
|
||||
movq %rsp, %rbp
|
||||
|
||||
subq $512, %rsp
|
||||
andq $~15, %rsp
|
||||
fxsaveq (%rsp)
|
||||
movq %rsp, IFRAME_fpu(%rbp)
|
||||
|
||||
// Set the RF (resume flag) in RFLAGS. This prevents an instruction
|
||||
// breakpoint on the instruction we're returning to to trigger a debug
|
||||
// exception.
|
||||
@ -235,8 +265,8 @@ STATIC_FUNCTION(int_bottom_user):
|
||||
UPDATE_THREAD_USER_TIME()
|
||||
|
||||
// Call the interrupt handler.
|
||||
movq %rsp, %rdi
|
||||
movq IFRAME_vector(%rsp), %rax
|
||||
movq %rbp, %rdi
|
||||
movq IFRAME_vector(%rbp), %rax
|
||||
call *gInterruptHandlerTable(, %rax, 8)
|
||||
|
||||
// If there are no signals pending or we're not debugging, we can avoid
|
||||
@ -250,6 +280,9 @@ STATIC_FUNCTION(int_bottom_user):
|
||||
|
||||
UPDATE_THREAD_KERNEL_TIME()
|
||||
|
||||
fxrstorq (%rsp)
|
||||
movq %rbp, %rsp
|
||||
|
||||
// Restore the saved registers.
|
||||
RESTORE_IFRAME()
|
||||
|
||||
@ -274,6 +307,9 @@ STATIC_FUNCTION(int_bottom_user):
|
||||
movq %rbp, %rdi
|
||||
call x86_init_user_debug_at_kernel_exit
|
||||
1:
|
||||
fxrstorq (%rsp)
|
||||
movq %rbp, %rsp
|
||||
|
||||
// Restore the saved registers.
|
||||
RESTORE_IFRAME()
|
||||
|
||||
@ -315,6 +351,7 @@ FUNCTION(x86_64_syscall_entry):
|
||||
|
||||
// Frame pointer is the iframe.
|
||||
movq %rsp, %rbp
|
||||
andq $~15, %rsp
|
||||
|
||||
// Preserve call number (R14 is callee-save), get thread pointer.
|
||||
movq %rax, %r14
|
||||
@ -367,10 +404,10 @@ FUNCTION(x86_64_syscall_entry):
|
||||
|
||||
// TODO: post-syscall tracing
|
||||
|
||||
.Lsyscall_return:
|
||||
// Restore the original stack pointer and return.
|
||||
movq %rbp, %rsp
|
||||
|
||||
.Lsyscall_return:
|
||||
// Clear the restarted flag.
|
||||
testl $THREAD_FLAGS_SYSCALL_RESTARTED, THREAD_flags(%r12)
|
||||
jz 2f
|
||||
@ -394,7 +431,9 @@ FUNCTION(x86_64_syscall_entry):
|
||||
|
||||
// If we've just restored a signal frame, use the IRET path.
|
||||
cmpq $SYSCALL_RESTORE_SIGNAL_FRAME, %r14
|
||||
je .Liret
|
||||
je .Lrestore_fpu
|
||||
|
||||
CLEAR_FPU_STATE()
|
||||
|
||||
// Restore the iframe and RCX/R11 for SYSRET.
|
||||
RESTORE_IFRAME()
|
||||
@ -458,14 +497,19 @@ FUNCTION(x86_64_syscall_entry):
|
||||
1:
|
||||
// Install breakpoints, if defined.
|
||||
testl $THREAD_FLAGS_BREAKPOINTS_DEFINED, THREAD_flags(%r12)
|
||||
jz .Liret
|
||||
jz 1f
|
||||
movq %rbp, %rdi
|
||||
call x86_init_user_debug_at_kernel_exit
|
||||
|
||||
1:
|
||||
// On this return path it is possible that the frame has been modified,
|
||||
// for example to execute a signal handler. In this case it is safer to
|
||||
// return via IRET.
|
||||
CLEAR_FPU_STATE()
|
||||
jmp .Liret
|
||||
|
||||
.Lrestore_fpu:
|
||||
movq IFRAME_fpu(%rbp), %rax
|
||||
fxrstorq (%rax)
|
||||
.Liret:
|
||||
// Restore the saved registers.
|
||||
RESTORE_IFRAME()
|
||||
@ -493,6 +537,7 @@ FUNCTION(x86_64_syscall_entry):
|
||||
|
||||
// Make space on the stack.
|
||||
subq %rcx, %rsp
|
||||
andq $~15, %rsp
|
||||
movq %rsp, %rdi
|
||||
|
||||
// Set a fault handler.
|
||||
@ -535,7 +580,7 @@ FUNCTION(x86_return_to_userland):
|
||||
testl $(THREAD_FLAGS_DEBUGGER_INSTALLED | THREAD_FLAGS_SIGNALS_PENDING \
|
||||
| THREAD_FLAGS_DEBUG_THREAD | THREAD_FLAGS_BREAKPOINTS_DEFINED) \
|
||||
, THREAD_flags(%r12)
|
||||
jnz .Lkernel_exit_work
|
||||
jnz .Luserland_return_work
|
||||
|
||||
// update the thread's kernel time and return
|
||||
UPDATE_THREAD_KERNEL_TIME()
|
||||
@ -544,4 +589,33 @@ FUNCTION(x86_return_to_userland):
|
||||
RESTORE_IFRAME()
|
||||
swapgs
|
||||
iretq
|
||||
.Luserland_return_work:
|
||||
// Slow path for return to userland.
|
||||
|
||||
// Do we need to handle signals?
|
||||
testl $(THREAD_FLAGS_SIGNALS_PENDING | THREAD_FLAGS_DEBUG_THREAD) \
|
||||
, THREAD_flags(%r12)
|
||||
jnz .Luserland_return_handle_signals
|
||||
cli
|
||||
call thread_at_kernel_exit_no_signals
|
||||
|
||||
.Luserland_return_work_done:
|
||||
// Install breakpoints, if defined.
|
||||
testl $THREAD_FLAGS_BREAKPOINTS_DEFINED, THREAD_flags(%r12)
|
||||
jz 1f
|
||||
movq %rbp, %rdi
|
||||
call x86_init_user_debug_at_kernel_exit
|
||||
1:
|
||||
// Restore the saved registers.
|
||||
RESTORE_IFRAME()
|
||||
|
||||
// Restore the previous GS base and return.
|
||||
swapgs
|
||||
iretq
|
||||
.Luserland_return_handle_signals:
|
||||
// thread_at_kernel_exit requires interrupts to be enabled, it will disable
|
||||
// them after.
|
||||
sti
|
||||
call thread_at_kernel_exit
|
||||
jmp .Luserland_return_work_done
|
||||
FUNCTION_END(x86_return_to_userland)
|
||||
|
@ -134,9 +134,12 @@ arch_thread_init(kernel_args* args)
|
||||
{
|
||||
// Save one global valid FPU state; it will be copied in the arch dependent
|
||||
// part of each new thread.
|
||||
asm volatile ("clts; fninit; fnclex;");
|
||||
x86_fxsave(sInitialState.fpu_state);
|
||||
|
||||
asm volatile (
|
||||
"clts;" \
|
||||
"fninit;" \
|
||||
"fnclex;" \
|
||||
"fxsave %0;"
|
||||
: "=m" (sInitialState.fpu_state));
|
||||
return B_OK;
|
||||
}
|
||||
|
||||
@ -296,15 +299,14 @@ arch_setup_signal_frame(Thread* thread, struct sigaction* action,
|
||||
signalFrameData->context.uc_mcontext.rip = frame->ip;
|
||||
signalFrameData->context.uc_mcontext.rflags = frame->flags;
|
||||
|
||||
// Store the FPU state. There appears to be a bug in GCC where the aligned
|
||||
// attribute on a structure is being ignored when the structure is allocated
|
||||
// on the stack, so even if the fpu_state struct has aligned(16) it may not
|
||||
// get aligned correctly. Instead, use the current thread's FPU save area
|
||||
// and then memcpy() to the frame structure.
|
||||
x86_fxsave(thread->arch_info.fpu_state);
|
||||
memcpy((void*)&signalFrameData->context.uc_mcontext.fpu,
|
||||
thread->arch_info.fpu_state,
|
||||
sizeof(signalFrameData->context.uc_mcontext.fpu));
|
||||
if (frame->fpu != nullptr) {
|
||||
memcpy((void*)&signalFrameData->context.uc_mcontext.fpu, frame->fpu,
|
||||
sizeof(signalFrameData->context.uc_mcontext.fpu));
|
||||
} else {
|
||||
memcpy((void*)&signalFrameData->context.uc_mcontext.fpu,
|
||||
sInitialState.fpu_state,
|
||||
sizeof(signalFrameData->context.uc_mcontext.fpu));
|
||||
}
|
||||
|
||||
// Fill in signalFrameData->context.uc_stack.
|
||||
signal_get_user_stack(frame->user_sp, &signalFrameData->context.uc_stack);
|
||||
@ -370,13 +372,12 @@ arch_restore_signal_frame(struct signal_frame_data* signalFrameData)
|
||||
frame->flags = (frame->flags & ~(uint64)X86_EFLAGS_USER_FLAGS)
|
||||
| (signalFrameData->context.uc_mcontext.rflags & X86_EFLAGS_USER_FLAGS);
|
||||
|
||||
// Same as above, alignment may not be correct. Copy to thread and restore
|
||||
// from there.
|
||||
Thread* thread = thread_get_current_thread();
|
||||
|
||||
memcpy(thread->arch_info.fpu_state,
|
||||
(void*)&signalFrameData->context.uc_mcontext.fpu,
|
||||
sizeof(thread->arch_info.fpu_state));
|
||||
x86_fxrstor(thread->arch_info.fpu_state);
|
||||
frame->fpu = &thread->arch_info.fpu_state;
|
||||
|
||||
// The syscall return code overwrites frame->ax with the return value of
|
||||
// the syscall, need to return it here to ensure the correct value is
|
||||
|
@ -59,14 +59,6 @@ static const struct cpu_vendor_info vendor_info[VENDOR_NUM] = {
|
||||
{ "NSC", { "Geode by NSC" } },
|
||||
};
|
||||
|
||||
#define CR0_CACHE_DISABLE (1UL << 30)
|
||||
#define CR0_NOT_WRITE_THROUGH (1UL << 29)
|
||||
#define CR0_FPU_EMULATION (1UL << 2)
|
||||
#define CR0_MONITOR_FPU (1UL << 1)
|
||||
|
||||
#define CR4_OS_FXSR (1UL << 9)
|
||||
#define CR4_OS_XMM_EXCEPTION (1UL << 10)
|
||||
|
||||
#define K8_SMIONCMPHALT (1ULL << 27)
|
||||
#define K8_C1EONCMPHALT (1ULL << 28)
|
||||
|
||||
@ -90,8 +82,10 @@ extern "C" void x86_reboot(void);
|
||||
// from arch.S
|
||||
|
||||
void (*gCpuIdleFunc)(void);
|
||||
#ifndef __x86_64__
|
||||
void (*gX86SwapFPUFunc)(void* oldState, const void* newState) = x86_noop_swap;
|
||||
bool gHasSSE = false;
|
||||
#endif
|
||||
|
||||
static uint32 sCpuRendezvous;
|
||||
static uint32 sCpuRendezvous2;
|
||||
@ -105,18 +99,6 @@ static const size_t kDoubleFaultStackSize = 4096; // size per CPU
|
||||
static x86_cpu_module_info* sCpuModule;
|
||||
|
||||
|
||||
extern "C" void memcpy_generic(void* dest, const void* source, size_t count);
|
||||
extern int memcpy_generic_end;
|
||||
extern "C" void memset_generic(void* dest, int value, size_t count);
|
||||
extern int memset_generic_end;
|
||||
|
||||
x86_optimized_functions gOptimizedFunctions = {
|
||||
memcpy_generic,
|
||||
&memcpy_generic_end,
|
||||
memset_generic,
|
||||
&memset_generic_end
|
||||
};
|
||||
|
||||
/* CPU topology information */
|
||||
static uint32 (*sGetCPUTopologyID)(int currentCPU);
|
||||
static uint32 sHierarchyMask[CPU_TOPOLOGY_LEVELS];
|
||||
@ -338,13 +320,14 @@ x86_init_fpu(void)
|
||||
#endif
|
||||
|
||||
dprintf("%s: CPU has SSE... enabling FXSR and XMM.\n", __func__);
|
||||
|
||||
#ifndef __x86_64__
|
||||
// enable OS support for SSE
|
||||
x86_write_cr4(x86_read_cr4() | CR4_OS_FXSR | CR4_OS_XMM_EXCEPTION);
|
||||
x86_write_cr0(x86_read_cr0() & ~(CR0_FPU_EMULATION | CR0_MONITOR_FPU));
|
||||
|
||||
gX86SwapFPUFunc = x86_fxsave_swap;
|
||||
gHasSSE = true;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -1163,33 +1146,6 @@ arch_cpu_init_post_modules(kernel_args* args)
|
||||
call_all_cpus(&init_mtrrs, NULL);
|
||||
}
|
||||
|
||||
// get optimized functions from the CPU module
|
||||
if (sCpuModule != NULL && sCpuModule->get_optimized_functions != NULL) {
|
||||
x86_optimized_functions functions;
|
||||
memset(&functions, 0, sizeof(functions));
|
||||
|
||||
sCpuModule->get_optimized_functions(&functions);
|
||||
|
||||
if (functions.memcpy != NULL) {
|
||||
gOptimizedFunctions.memcpy = functions.memcpy;
|
||||
gOptimizedFunctions.memcpy_end = functions.memcpy_end;
|
||||
}
|
||||
|
||||
if (functions.memset != NULL) {
|
||||
gOptimizedFunctions.memset = functions.memset;
|
||||
gOptimizedFunctions.memset_end = functions.memset_end;
|
||||
}
|
||||
}
|
||||
|
||||
// put the optimized functions into the commpage
|
||||
size_t memcpyLen = (addr_t)gOptimizedFunctions.memcpy_end
|
||||
- (addr_t)gOptimizedFunctions.memcpy;
|
||||
addr_t memcpyPosition = fill_commpage_entry(COMMPAGE_ENTRY_X86_MEMCPY,
|
||||
(const void*)gOptimizedFunctions.memcpy, memcpyLen);
|
||||
size_t memsetLen = (addr_t)gOptimizedFunctions.memset_end
|
||||
- (addr_t)gOptimizedFunctions.memset;
|
||||
addr_t memsetPosition = fill_commpage_entry(COMMPAGE_ENTRY_X86_MEMSET,
|
||||
(const void*)gOptimizedFunctions.memset, memsetLen);
|
||||
size_t threadExitLen = (addr_t)x86_end_userspace_thread_exit
|
||||
- (addr_t)x86_userspace_thread_exit;
|
||||
addr_t threadExitPosition = fill_commpage_entry(
|
||||
@ -1198,10 +1154,7 @@ arch_cpu_init_post_modules(kernel_args* args)
|
||||
|
||||
// add the functions to the commpage image
|
||||
image_id image = get_commpage_image();
|
||||
elf_add_memory_image_symbol(image, "commpage_memcpy", memcpyPosition,
|
||||
memcpyLen, B_SYMBOL_TYPE_TEXT);
|
||||
elf_add_memory_image_symbol(image, "commpage_memset", memsetPosition,
|
||||
memsetLen, B_SYMBOL_TYPE_TEXT);
|
||||
|
||||
elf_add_memory_image_symbol(image, "commpage_thread_exit",
|
||||
threadExitPosition, threadExitLen, B_SYMBOL_TYPE_TEXT);
|
||||
|
||||
|
@ -31,7 +31,9 @@
|
||||
extern "C" void x86_return_to_userland(iframe* frame);
|
||||
|
||||
// from arch_cpu.cpp
|
||||
#ifndef __x86_64__
|
||||
extern void (*gX86SwapFPUFunc)(void *oldState, const void *newState);
|
||||
#endif
|
||||
|
||||
|
||||
static struct iframe*
|
||||
@ -245,7 +247,9 @@ arch_thread_context_switch(Thread* from, Thread* to)
|
||||
activePagingStructures->RemoveReference();
|
||||
}
|
||||
|
||||
#ifndef __x86_64__
|
||||
gX86SwapFPUFunc(from->arch_info.fpu_state, to->arch_info.fpu_state);
|
||||
#endif
|
||||
x86_context_switch(&from->arch_info, &to->arch_info);
|
||||
}
|
||||
|
||||
|
@ -33,7 +33,9 @@
|
||||
// TODO: Make those real error codes.
|
||||
|
||||
|
||||
#ifndef __x86_64__
|
||||
extern bool gHasSSE;
|
||||
#endif
|
||||
|
||||
// The software breakpoint instruction (int3).
|
||||
const uint8 kX86SoftwareBreakpoint[1] = { 0xcc };
|
||||
@ -688,6 +690,12 @@ arch_set_debug_cpu_state(const debug_cpu_state* cpuState)
|
||||
if (iframe* frame = x86_get_user_iframe()) {
|
||||
// For the floating point state to be correct the calling function must
|
||||
// not use these registers (not even indirectly).
|
||||
#ifdef __x86_64__
|
||||
Thread* thread = thread_get_current_thread();
|
||||
memcpy(thread->arch_info.fpu_state, &cpuState->extended_registers,
|
||||
sizeof(cpuState->extended_registers));
|
||||
frame->fpu = &thread->arch_info.fpu_state;
|
||||
#else
|
||||
if (gHasSSE) {
|
||||
// Since fxrstor requires 16-byte alignment and this isn't
|
||||
// guaranteed passed buffer, we use our thread's fpu_state field as
|
||||
@ -698,12 +706,11 @@ arch_set_debug_cpu_state(const debug_cpu_state* cpuState)
|
||||
memcpy(thread->arch_info.fpu_state, &cpuState->extended_registers,
|
||||
sizeof(cpuState->extended_registers));
|
||||
x86_fxrstor(thread->arch_info.fpu_state);
|
||||
#ifndef __x86_64__
|
||||
} else {
|
||||
// TODO: Implement! We need to convert the format first.
|
||||
// x86_frstor(&cpuState->extended_registers);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
set_iframe_registers(frame, cpuState);
|
||||
}
|
||||
}
|
||||
@ -715,6 +722,15 @@ arch_get_debug_cpu_state(debug_cpu_state* cpuState)
|
||||
if (iframe* frame = x86_get_user_iframe()) {
|
||||
// For the floating point state to be correct the calling function must
|
||||
// not use these registers (not even indirectly).
|
||||
#ifdef __x86_64__
|
||||
if (frame->fpu != nullptr) {
|
||||
memcpy(&cpuState->extended_registers, frame->fpu,
|
||||
sizeof(cpuState->extended_registers));
|
||||
} else {
|
||||
memset(&cpuState->extended_registers, 0,
|
||||
sizeof(cpuState->extended_registers));
|
||||
}
|
||||
#else
|
||||
if (gHasSSE) {
|
||||
// Since fxsave requires 16-byte alignment and this isn't guaranteed
|
||||
// passed buffer, we use our thread's fpu_state field as temporary
|
||||
@ -725,15 +741,14 @@ arch_get_debug_cpu_state(debug_cpu_state* cpuState)
|
||||
// unlike fnsave, fxsave doesn't reinit the FPU state
|
||||
memcpy(&cpuState->extended_registers, thread->arch_info.fpu_state,
|
||||
sizeof(cpuState->extended_registers));
|
||||
#ifndef __x86_64__
|
||||
} else {
|
||||
x86_fnsave(&cpuState->extended_registers);
|
||||
x86_frstor(&cpuState->extended_registers);
|
||||
// fnsave reinits the FPU state after saving, so we need to
|
||||
// load it again
|
||||
// TODO: Convert to fxsave format!
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
get_iframe_registers(frame, cpuState);
|
||||
}
|
||||
}
|
||||
|
@ -70,6 +70,7 @@ dummy()
|
||||
DEFINE_OFFSET_MACRO(IFRAME, iframe, r8);
|
||||
DEFINE_OFFSET_MACRO(IFRAME, iframe, r9);
|
||||
DEFINE_OFFSET_MACRO(IFRAME, iframe, r10);
|
||||
DEFINE_OFFSET_MACRO(IFRAME, iframe, fpu);
|
||||
#else
|
||||
DEFINE_OFFSET_MACRO(IFRAME, iframe, orig_eax);
|
||||
#endif
|
||||
@ -79,12 +80,6 @@ dummy()
|
||||
DEFINE_OFFSET_MACRO(SYSCALL_INFO, syscall_info, function);
|
||||
DEFINE_OFFSET_MACRO(SYSCALL_INFO, syscall_info, parameter_size);
|
||||
|
||||
// struct x86_optimized_functions
|
||||
DEFINE_OFFSET_MACRO(X86_OPTIMIZED_FUNCTIONS, x86_optimized_functions,
|
||||
memcpy);
|
||||
DEFINE_OFFSET_MACRO(X86_OPTIMIZED_FUNCTIONS, x86_optimized_functions,
|
||||
memset);
|
||||
|
||||
// struct signal_frame_data
|
||||
DEFINE_SIZEOF_MACRO(SIGNAL_FRAME_DATA, signal_frame_data);
|
||||
DEFINE_OFFSET_MACRO(SIGNAL_FRAME_DATA, signal_frame_data, info);
|
||||
|
@ -6,22 +6,12 @@
|
||||
* Distributed under the terms of the NewOS License.
|
||||
*/
|
||||
|
||||
#if !_BOOT_MODE
|
||||
# include "asm_offsets.h"
|
||||
#endif
|
||||
|
||||
#include <asm_defs.h>
|
||||
|
||||
|
||||
// We don't need the indirection in the boot loader.
|
||||
#if _BOOT_MODE
|
||||
# define memcpy_generic memcpy
|
||||
# define memset_generic memset
|
||||
#endif
|
||||
|
||||
|
||||
.align 4
|
||||
FUNCTION(memcpy_generic):
|
||||
FUNCTION(memcpy):
|
||||
pushl %esi
|
||||
pushl %edi
|
||||
movl 12(%esp),%edi /* dest */
|
||||
@ -45,13 +35,13 @@ FUNCTION(memcpy_generic):
|
||||
popl %edi
|
||||
popl %esi
|
||||
ret
|
||||
FUNCTION_END(memcpy_generic)
|
||||
SYMBOL(memcpy_generic_end):
|
||||
FUNCTION_END(memcpy)
|
||||
SYMBOL(memcpy_end):
|
||||
|
||||
|
||||
/* void *memset(void *dest, int value, size_t length); */
|
||||
.align 4
|
||||
FUNCTION(memset_generic):
|
||||
FUNCTION(memset):
|
||||
push %ebp
|
||||
mov %esp, %ebp
|
||||
|
||||
@ -111,19 +101,6 @@ FUNCTION(memset_generic):
|
||||
mov %ebp, %esp
|
||||
pop %ebp
|
||||
ret
|
||||
FUNCTION_END(memset_generic)
|
||||
SYMBOL(memset_generic_end):
|
||||
|
||||
|
||||
#if !_BOOT_MODE
|
||||
|
||||
.align 4
|
||||
FUNCTION(memcpy):
|
||||
jmp *(gOptimizedFunctions + X86_OPTIMIZED_FUNCTIONS_memcpy)
|
||||
FUNCTION_END(memcpy)
|
||||
|
||||
FUNCTION(memset):
|
||||
jmp *(gOptimizedFunctions + X86_OPTIMIZED_FUNCTIONS_memset)
|
||||
FUNCTION_END(memset)
|
||||
SYMBOL(memset_end):
|
||||
|
||||
#endif // !_BOOT_MODE
|
||||
|
@ -21,6 +21,7 @@ KernelMergeObject kernel_os_arch_$(TARGET_ARCH).o :
|
||||
;
|
||||
|
||||
SEARCH_SOURCE += [ FDirName $(posixSources) arch $(TARGET_ARCH) ] ;
|
||||
SEARCH_SOURCE += [ FDirName $(posixSources) string arch $(TARGET_ARCH) ] ;
|
||||
|
||||
KernelMergeObject kernel_lib_posix_arch_$(TARGET_ARCH).o :
|
||||
siglongjmp.S
|
||||
@ -28,12 +29,8 @@ KernelMergeObject kernel_lib_posix_arch_$(TARGET_ARCH).o :
|
||||
kernel_longjmp_return.c
|
||||
kernel_setjmp_save_sigs.c
|
||||
|
||||
arch_string.S
|
||||
arch_string.cpp
|
||||
|
||||
: $(TARGET_KERNEL_PIC_CCFLAGS)
|
||||
;
|
||||
|
||||
# Explicitly tell the build system that arch_string.S includes the generated
|
||||
# asm_offsets.h.
|
||||
Includes [ FGristFiles arch_string.S ]
|
||||
: <src!system!kernel!arch!x86>asm_offsets.h ;
|
||||
|
@ -1,96 +0,0 @@
|
||||
/*
|
||||
* Copyright 2012, Alex Smith, alex@alex-smith.me.uk.
|
||||
* Distributed under the terms of the MIT License.
|
||||
*/
|
||||
|
||||
|
||||
#include <asm_defs.h>
|
||||
|
||||
#include "asm_offsets.h"
|
||||
|
||||
|
||||
.align 8
|
||||
FUNCTION(memcpy_generic):
|
||||
push %rbp
|
||||
movq %rsp, %rbp
|
||||
|
||||
// Preserve original destination address for return value.
|
||||
movq %rdi, %rax
|
||||
|
||||
// size -> %rcx
|
||||
movq %rdx, %rcx
|
||||
|
||||
// For small copies, always do it bytewise, the additional overhead is
|
||||
// not worth it.
|
||||
cmp $24, %rcx
|
||||
jl .Lmemcpy_generic_byte_copy
|
||||
|
||||
// Do both source and dest have the same alignment?
|
||||
movq %rsi, %r8
|
||||
xorq %rdi, %r8
|
||||
test $7, %r8
|
||||
jnz .Lmemcpy_generic_byte_copy
|
||||
|
||||
// Align up to an 8-byte boundary.
|
||||
movq %rdi, %r8
|
||||
andq $7, %r8
|
||||
jz .Lmemcpy_generic_qword_copy
|
||||
movq $8, %rcx
|
||||
subq %r8, %rcx
|
||||
subq %rcx, %rdx // Subtract from the overall count.
|
||||
rep
|
||||
movsb
|
||||
|
||||
// Get back the original count value.
|
||||
movq %rdx, %rcx
|
||||
.Lmemcpy_generic_qword_copy:
|
||||
// Move by quadwords.
|
||||
shrq $3, %rcx
|
||||
rep
|
||||
movsq
|
||||
|
||||
// Get the remaining count.
|
||||
movq %rdx, %rcx
|
||||
andq $7, %rcx
|
||||
.Lmemcpy_generic_byte_copy:
|
||||
// Move any remaining data by bytes.
|
||||
rep
|
||||
movsb
|
||||
|
||||
pop %rbp
|
||||
ret
|
||||
FUNCTION_END(memcpy_generic)
|
||||
SYMBOL(memcpy_generic_end):
|
||||
|
||||
|
||||
.align 8
|
||||
FUNCTION(memset_generic):
|
||||
push %rbp
|
||||
movq %rsp, %rbp
|
||||
|
||||
// Preserve original destination address for return value.
|
||||
movq %rdi, %r8
|
||||
|
||||
// size -> %rcx, value -> %al
|
||||
movq %rdx, %rcx
|
||||
movl %esi, %eax
|
||||
|
||||
// Move by bytes.
|
||||
rep
|
||||
stosb
|
||||
|
||||
movq %r8, %rax
|
||||
pop %rbp
|
||||
ret
|
||||
FUNCTION_END(memset_generic)
|
||||
SYMBOL(memset_generic_end):
|
||||
|
||||
|
||||
FUNCTION(memcpy):
|
||||
jmp *(gOptimizedFunctions + X86_OPTIMIZED_FUNCTIONS_memcpy)
|
||||
FUNCTION_END(memcpy)
|
||||
|
||||
FUNCTION(memset):
|
||||
jmp *(gOptimizedFunctions + X86_OPTIMIZED_FUNCTIONS_memset)
|
||||
FUNCTION_END(memset)
|
||||
|
@ -1,5 +1,7 @@
|
||||
SubDir HAIKU_TOP src system libroot posix string arch x86_64 ;
|
||||
|
||||
SubDirC++Flags -std=gnu++11 ;
|
||||
|
||||
local architectureObject ;
|
||||
for architectureObject in [ MultiArchSubDirSetup x86_64 ] {
|
||||
on $(architectureObject) {
|
||||
@ -8,7 +10,7 @@ for architectureObject in [ MultiArchSubDirSetup x86_64 ] {
|
||||
UsePrivateSystemHeaders ;
|
||||
|
||||
MergeObject <$(architecture)>posix_string_arch_$(TARGET_ARCH).o :
|
||||
arch_string.S
|
||||
arch_string.cpp
|
||||
;
|
||||
}
|
||||
}
|
||||
|
@ -1,22 +0,0 @@
|
||||
/*
|
||||
* Copyright 2008, Ingo Weinhold, ingo_weinhold@gmx.de.
|
||||
* Distributed under the terms of the MIT License.
|
||||
*/
|
||||
|
||||
#include <asm_defs.h>
|
||||
#include <commpage_defs.h>
|
||||
|
||||
|
||||
FUNCTION(memcpy):
|
||||
movq __gCommPageAddress@GOTPCREL(%rip), %rax
|
||||
movq (%rax), %rax
|
||||
addq 8 * COMMPAGE_ENTRY_X86_MEMCPY(%rax), %rax
|
||||
jmp *%rax
|
||||
FUNCTION_END(memcpy)
|
||||
|
||||
FUNCTION(memset):
|
||||
movq __gCommPageAddress@GOTPCREL(%rip), %rax
|
||||
movq (%rax), %rax
|
||||
addq 8 * COMMPAGE_ENTRY_X86_MEMSET(%rax), %rax
|
||||
jmp *%rax
|
||||
FUNCTION_END(memset)
|
216
src/system/libroot/posix/string/arch/x86_64/arch_string.cpp
Normal file
216
src/system/libroot/posix/string/arch/x86_64/arch_string.cpp
Normal file
@ -0,0 +1,216 @@
|
||||
/*
|
||||
* Copyright 2014, Paweł Dziepak, pdziepak@quarnos.org.
|
||||
* Distributed under the terms of the MIT License.
|
||||
*/
|
||||
|
||||
|
||||
#include <array>
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
#include <x86intrin.h>
|
||||
|
||||
|
||||
namespace {
|
||||
|
||||
|
||||
template<template<size_t N> class Generator, unsigned N, unsigned ...Index>
|
||||
struct GenerateTable : GenerateTable<Generator, N - 1, N - 1, Index...> {
|
||||
};
|
||||
|
||||
template<template<size_t N> class Generator, unsigned ...Index>
|
||||
struct GenerateTable<Generator, 0, Index...>
|
||||
: std::array<decltype(Generator<0>::sValue), sizeof...(Index)> {
|
||||
constexpr GenerateTable()
|
||||
:
|
||||
std::array<decltype(Generator<0>::sValue), sizeof...(Index)> {
|
||||
{ Generator<Index>::sValue... }
|
||||
}
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
static inline void memcpy_repmovs(uint8_t* destination, const uint8_t* source,
|
||||
size_t length)
|
||||
{
|
||||
__asm__ __volatile__("rep movsb"
|
||||
: "+D" (destination), "+S" (source), "+c" (length)
|
||||
:
|
||||
: "memory");
|
||||
}
|
||||
|
||||
|
||||
template<size_t N>
|
||||
inline void copy_small(uint8_t* destination, const uint8_t* source)
|
||||
{
|
||||
struct data {
|
||||
uint8_t x[N];
|
||||
};
|
||||
*reinterpret_cast<data*>(destination)
|
||||
= *reinterpret_cast<const data*>(source);
|
||||
}
|
||||
|
||||
|
||||
template<size_t N>
|
||||
struct SmallGenerator {
|
||||
constexpr static void (*sValue)(uint8_t*, const uint8_t*) = copy_small<N>;
|
||||
};
|
||||
constexpr static GenerateTable<SmallGenerator, 8> table_small;
|
||||
|
||||
|
||||
static inline void memcpy_small(uint8_t* destination, const uint8_t* source,
|
||||
size_t length)
|
||||
{
|
||||
if (length < 8) {
|
||||
table_small[length](destination, source);
|
||||
} else {
|
||||
auto to = reinterpret_cast<uint64_t*>(destination);
|
||||
auto from = reinterpret_cast<const uint64_t*>(source);
|
||||
*to = *from;
|
||||
to = reinterpret_cast<uint64_t*>(destination + length - 8);
|
||||
from = reinterpret_cast<const uint64_t*>(source + length - 8);
|
||||
*to = *from;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<size_t N>
|
||||
inline void copy_sse(__m128i* destination, const __m128i* source)
|
||||
{
|
||||
auto temp = _mm_loadu_si128(source);
|
||||
_mm_storeu_si128(destination, temp);
|
||||
copy_sse<N - 1>(destination + 1, source + 1);
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
inline void copy_sse<0>(__m128i* destination, const __m128i* source)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
template<size_t N>
|
||||
struct SSEGenerator {
|
||||
constexpr static void (*sValue)(__m128i*, const __m128i*) = copy_sse<N>;
|
||||
};
|
||||
constexpr static GenerateTable<SSEGenerator, 4> table_sse;
|
||||
|
||||
|
||||
static inline void memcpy_sse(uint8_t* destination, const uint8_t* source, size_t length)
|
||||
{
|
||||
auto to = reinterpret_cast<__m128i*>(destination);
|
||||
auto from = reinterpret_cast<const __m128i*>(source);
|
||||
auto toEnd = reinterpret_cast<__m128i*>(destination + length - 16);
|
||||
auto fromEnd = reinterpret_cast<const __m128i*>(source + length - 16);
|
||||
while (length >= 64) {
|
||||
copy_sse<4>(to, from);
|
||||
to += 4;
|
||||
from += 4;
|
||||
length -= 64;
|
||||
}
|
||||
if (length >= 16) {
|
||||
table_sse[length / 16](to, from);
|
||||
length %= 16;
|
||||
}
|
||||
if (length) {
|
||||
copy_sse<1>(toEnd, fromEnd);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
extern "C" void* memcpy(void* destination, const void* source, size_t length)
|
||||
{
|
||||
auto to = static_cast<uint8_t*>(destination);
|
||||
auto from = static_cast<const uint8_t*>(source);
|
||||
if (length <= 16) {
|
||||
memcpy_small(to, from, length);
|
||||
return destination;
|
||||
}
|
||||
if (length < 2048) {
|
||||
memcpy_sse(to, from, length);
|
||||
return destination;
|
||||
}
|
||||
memcpy_repmovs(to, from, length);
|
||||
return destination;
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
memset_repstos(uint8_t* destination, uint8_t value, size_t length)
|
||||
{
|
||||
__asm__ __volatile__("rep stosb"
|
||||
: "+D" (destination), "+c" (length)
|
||||
: "a" (value)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
memset_sse(uint8_t* destination, uint8_t value, size_t length)
|
||||
{
|
||||
__m128i packed = _mm_set1_epi8(value);
|
||||
auto end = reinterpret_cast<__m128i*>(destination + length - 16);
|
||||
auto diff = reinterpret_cast<uintptr_t>(destination) % 16;
|
||||
if (diff) {
|
||||
diff = 16 - diff;
|
||||
length -= diff;
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(destination), packed);
|
||||
}
|
||||
auto ptr = reinterpret_cast<__m128i*>(destination + diff);
|
||||
while (length >= 64) {
|
||||
_mm_store_si128(ptr++, packed);
|
||||
_mm_store_si128(ptr++, packed);
|
||||
_mm_store_si128(ptr++, packed);
|
||||
_mm_store_si128(ptr++, packed);
|
||||
length -= 64;
|
||||
}
|
||||
while (length >= 16) {
|
||||
_mm_store_si128(ptr++, packed);
|
||||
length -= 16;
|
||||
}
|
||||
_mm_storeu_si128(end, packed);
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
memset_small(uint8_t* destination, uint8_t value, size_t length)
|
||||
{
|
||||
if (length >= 8) {
|
||||
auto packed = value * 0x101010101010101ul;
|
||||
auto ptr = reinterpret_cast<uint64_t*>(destination);
|
||||
auto end = reinterpret_cast<uint64_t*>(destination + length - 8);
|
||||
while (length >= 8) {
|
||||
*ptr++ = packed;
|
||||
length -= 8;
|
||||
}
|
||||
*end = packed;
|
||||
} else {
|
||||
while (length--) {
|
||||
*destination++ = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern "C" void*
|
||||
memset(void* ptr, int chr, size_t length)
|
||||
{
|
||||
auto value = static_cast<unsigned char>(chr);
|
||||
auto destination = static_cast<uint8_t*>(ptr);
|
||||
if (length < 32) {
|
||||
memset_small(destination, value, length);
|
||||
return ptr;
|
||||
}
|
||||
if (length < 2048) {
|
||||
memset_sse(destination, value, length);
|
||||
return ptr;
|
||||
}
|
||||
memset_repstos(destination, value, length);
|
||||
return ptr;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user