From 1d7b716f84b6cbed439a33aa4df78f3b0dfc279b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Dziepak?= Date: Sun, 7 Sep 2014 21:43:28 +0200 Subject: [PATCH] libroot/x86_64: new memset implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces a new memset() implementation that improves the performance when the buffer is small. It was written for processors that support ERMSB, but performs reasonably well on older CPUs as well. The following benchmarks were done on a Haswell i7 running Debian Jessie with Linux 3.16.1. In each iteration a 64MB buffer was memset()ed; the parameter "size" is the size of the buffer passed in a single call (i.e. for "size: 2" memset() was called ~32 million times to memset the whole 64MB). f - original implementation, g - new implementation, all buffers 16 byte aligned set, size: 8, f: 66885 µs, g: 17768 µs, ∆: 73.44% set, size: 32, f: 17123 µs, g: 9163 µs, ∆: 46.49% set, size: 128, f: 6677 µs, g: 6919 µs, ∆: -3.62% set, size: 512, f: 11656 µs, g: 7715 µs, ∆: 33.81% set, size: 1024, f: 9156 µs, g: 7359 µs, ∆: 19.63% set, size: 4096, f: 4936 µs, g: 5159 µs, ∆: -4.52% f - glibc 2.19 implementation, g - new implementation, all buffers 16 byte aligned set, size: 8, f: 19631 µs, g: 17828 µs, ∆: 9.18% set, size: 32, f: 8545 µs, g: 9047 µs, ∆: -5.87% set, size: 128, f: 8304 µs, g: 6874 µs, ∆: 17.22% set, size: 512, f: 7373 µs, g: 7486 µs, ∆: -1.53% set, size: 1024, f: 9007 µs, g: 7344 µs, ∆: 18.46% set, size: 4096, f: 8169 µs, g: 5146 µs, ∆: 37.01% Apparently, glibc uses SSE even for large buffers and therefore does not take advantage of ERMSB: set, size: 16384, f: 7007 µs, g: 3223 µs, ∆: 54.00% set, size: 32768, f: 6979 µs, g: 2930 µs, ∆: 58.02% set, size: 65536, f: 6907 µs, g: 2826 µs, ∆: 59.08% set, size: 131072, f: 6919 µs, g: 2752 µs, ∆: 60.23% The new implementation handles unaligned buffers quite well: f - glibc 2.19 implementation, g - new implementation, all buffers 
unaligned set, size: 16, f: 10045 µs, g: 10498 µs, ∆: -4.51% set, size: 32, f: 8590 µs, g: 9358 µs, ∆: -8.94% set, size: 64, f: 8618 µs, g: 8585 µs, ∆: 0.38% set, size: 128, f: 8393 µs, g: 6893 µs, ∆: 17.87% set, size: 256, f: 8042 µs, g: 7621 µs, ∆: 5.24% set, size: 512, f: 9661 µs, g: 7738 µs, ∆: 19.90% Signed-off-by: Paweł Dziepak --- .../posix/string/arch/x86_64/arch_string.cpp | 74 ++++++++++++++++++- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp b/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp index b83376c331..33fca22474 100644 --- a/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp +++ b/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp @@ -5,6 +5,9 @@ #include <cstddef> +#include <cstdint> + +#include <x86intrin.h> extern "C" void* @@ -18,14 +21,77 @@ memcpy(void* destination, const void* source, size_t length) } -extern "C" void* -memset(void* destination, int value, size_t length) +static inline void +memset_repstos(uint8_t* destination, uint8_t value, size_t length) { - auto returnValue = destination; __asm__ __volatile__("rep stosb" : "+D" (destination), "+c" (length) : "a" (value) : "memory"); - return returnValue; +} + + +static inline void +memset_sse(uint8_t* destination, uint8_t value, size_t length) +{ + __m128i packed = _mm_set1_epi8(value); + auto end = reinterpret_cast<__m128i*>(destination + length - 16); + auto diff = reinterpret_cast<uintptr_t>(destination) % 16; + if (diff) { + diff = 16 - diff; + length -= diff; + _mm_storeu_si128(reinterpret_cast<__m128i*>(destination), packed); + } + auto ptr = reinterpret_cast<__m128i*>(destination + diff); + while (length >= 64) { + _mm_store_si128(ptr++, packed); + _mm_store_si128(ptr++, packed); + _mm_store_si128(ptr++, packed); + _mm_store_si128(ptr++, packed); + length -= 64; + } + while (length >= 16) { + _mm_store_si128(ptr++, packed); + length -= 16; + } + _mm_storeu_si128(end, packed); +} + + +static inline void 
+memset_small(uint8_t* destination, uint8_t value, size_t length) +{ + if (length >= 8) { + auto packed = value * 0x101010101010101ul; + auto ptr = reinterpret_cast<uint64_t*>(destination); + auto end = reinterpret_cast<uint64_t*>(destination + length - 8); + while (length >= 8) { + *ptr++ = packed; + length -= 8; + } + *end = packed; + } else { + while (length--) { + *destination++ = value; + } + } +} + + +extern "C" void* +memset(void* ptr, int chr, size_t length) +{ + auto value = static_cast<uint8_t>(chr); + auto destination = static_cast<uint8_t*>(ptr); + if (length < 32) { + memset_small(destination, value, length); + return ptr; + } + if (length < 2048) { + memset_sse(destination, value, length); + return ptr; + } + memset_repstos(destination, value, length); + return ptr; }