libroot/x86_64: new memset implementation

This patch introduces a new memset() implementation that improves
performance when the buffer is small. It was written for processors that
support ERMSB, but performs reasonably well on older CPUs as well.

The following benchmarks were done on a Haswell i7 running Debian Jessie
with Linux 3.16.1. In each iteration a 64MB buffer was memset()ed; the
parameter "size" is the size of the buffer passed in a single call (i.e.
for "size: 2" memset() was called ~32 million times to memset the whole
64MB).

f - original implementation, g - new implementation, all buffers 16 byte
aligned

set, size:        8, f:    66885 µs, g:    17768 µs, ∆:   73.44%
set, size:       32, f:    17123 µs, g:     9163 µs, ∆:   46.49%
set, size:      128, f:     6677 µs, g:     6919 µs, ∆:   -3.62%
set, size:      512, f:    11656 µs, g:     7715 µs, ∆:   33.81%
set, size:     1024, f:     9156 µs, g:     7359 µs, ∆:   19.63%
set, size:     4096, f:     4936 µs, g:     5159 µs, ∆:   -4.52%

f - glibc 2.19 implementation, g - new implementation, all buffers 16 byte
aligned

set, size:        8, f:    19631 µs, g:    17828 µs, ∆:    9.18%
set, size:       32, f:     8545 µs, g:     9047 µs, ∆:   -5.87%
set, size:      128, f:     8304 µs, g:     6874 µs, ∆:   17.22%
set, size:      512, f:     7373 µs, g:     7486 µs, ∆:   -1.53%
set, size:     1024, f:     9007 µs, g:     7344 µs, ∆:   18.46%
set, size:     4096, f:     8169 µs, g:     5146 µs, ∆:   37.01%

Apparently, glibc uses SSE even for large buffers and therefore does not
take advantage of ERMSB:

set, size:    16384, f:     7007 µs, g:     3223 µs, ∆:   54.00%
set, size:    32768, f:     6979 µs, g:     2930 µs, ∆:   58.02%
set, size:    65536, f:     6907 µs, g:     2826 µs, ∆:   59.08%
set, size:   131072, f:     6919 µs, g:     2752 µs, ∆:   60.23%

The new implementation handles unaligned buffers quite well:

f - glibc 2.19 implementation, g - new implementation, all buffers unaligned

set, size:       16, f:    10045 µs, g:    10498 µs, ∆:   -4.51%
set, size:       32, f:     8590 µs, g:     9358 µs, ∆:   -8.94%
set, size:       64, f:     8618 µs, g:     8585 µs, ∆:    0.38%
set, size:      128, f:     8393 µs, g:     6893 µs, ∆:   17.87%
set, size:      256, f:     8042 µs, g:     7621 µs, ∆:    5.24%
set, size:      512, f:     9661 µs, g:     7738 µs, ∆:   19.90%

Signed-off-by: Paweł Dziepak <pdziepak@quarnos.org>
This commit is contained in:
Paweł Dziepak 2014-09-07 21:43:28 +02:00
parent 718fd007a6
commit 1d7b716f84
1 changed files with 70 additions and 4 deletions

View File

@ -5,6 +5,9 @@
#include <cstddef>
#include <cstdint>
#include <x86intrin.h>
extern "C" void*
@ -18,14 +21,77 @@ memcpy(void* destination, const void* source, size_t length)
}
extern "C" void*
memset(void* destination, int value, size_t length)
// Fills `length` bytes starting at `destination` with `value` using a single
// "rep stosb" instruction.
//
// destination - first byte to write
// value       - byte pattern to store (passed in AL)
// length      - number of bytes to write (passed in RCX)
//
// The function returns nothing; callers that need the original pointer must
// keep their own copy, since the instruction advances RDI as it writes.
static inline void
memset_repstos(uint8_t* destination, uint8_t value, size_t length)
{
    // "+D" and "+c" are read-write operands because rep stosb updates both
    // the destination pointer and the counter. The "memory" clobber tells the
    // compiler that memory it cannot see through the operands is modified.
    // NOTE(review): rep stosb honours the direction flag; this assumes DF is
    // clear on entry - confirm against the platform ABI.
    __asm__ __volatile__("rep stosb"
        : "+D" (destination), "+c" (length)
        : "a" (value)
        : "memory");
}
// Fills `length` bytes at `destination` with `value` using 16-byte SSE stores.
//
// The first and last 16 bytes are written with unaligned stores; everything
// in between is written with aligned stores, so the unaligned head/tail
// stores may overlap bytes that the aligned loop also writes.
//
// The tail store targets `destination + length - 16`, so `length` must be at
// least 16; the visible caller only passes lengths of 32 or more.
static inline void
memset_sse(uint8_t* destination, uint8_t value, size_t length)
{
    const __m128i pattern = _mm_set1_epi8(value);

    // Position of the final 16 bytes, taken before `length` is adjusted below.
    __m128i* const tail
        = reinterpret_cast<__m128i*>(destination + length - 16);

    // Number of bytes to skip to reach the next 16-byte boundary (0 if the
    // buffer is already aligned).
    uintptr_t head = 0;
    const uintptr_t misalignment
        = reinterpret_cast<uintptr_t>(destination) % 16;
    if (misalignment != 0) {
        head = 16 - misalignment;
        // One unaligned store covers the bytes in front of the boundary.
        _mm_storeu_si128(reinterpret_cast<__m128i*>(destination), pattern);
        length -= head;
    }

    __m128i* cursor = reinterpret_cast<__m128i*>(destination + head);

    // Main loop: four aligned stores (64 bytes) per iteration.
    for (; length >= 64; length -= 64, cursor += 4) {
        _mm_store_si128(cursor, pattern);
        _mm_store_si128(cursor + 1, pattern);
        _mm_store_si128(cursor + 2, pattern);
        _mm_store_si128(cursor + 3, pattern);
    }

    // Remaining whole 16-byte chunks.
    for (; length >= 16; length -= 16)
        _mm_store_si128(cursor++, pattern);

    // Whatever is left (fewer than 16 bytes) is covered by the tail store.
    _mm_storeu_si128(tail, pattern);
}
// Fills `length` bytes at `destination` with `value` without SSE.
//
// Buffers shorter than 8 bytes are written one byte at a time. Longer buffers
// are written as 64-bit words holding `value` replicated in every byte,
// followed by one extra word store placed at the last 8 bytes of the buffer
// to cover any remainder (it may overlap the words already written).
static inline void
memset_small(uint8_t* destination, uint8_t value, size_t length)
{
    if (length < 8) {
        for (; length != 0; --length)
            *destination++ = value;
        return;
    }

    // Multiplying by 0x0101...01 copies the byte into all eight positions.
    const uint64_t pattern = value * 0x101010101010101ul;

    uint64_t* const lastWord
        = reinterpret_cast<uint64_t*>(destination + length - 8);
    uint64_t* word = reinterpret_cast<uint64_t*>(destination);

    for (size_t remaining = length; remaining >= 8; remaining -= 8)
        *word++ = pattern;

    *lastWord = pattern;
}
// C memset(): sets `length` bytes at `ptr` to `chr` converted to
// unsigned char, and returns `ptr`.
//
// The work is delegated by size:
//   length < 32          -> memset_small()
//   32 <= length < 2048  -> memset_sse()
//   length >= 2048       -> memset_repstos()
extern "C" void*
memset(void* ptr, int chr, size_t length)
{
    uint8_t* const target = static_cast<uint8_t*>(ptr);
    const uint8_t fill = static_cast<unsigned char>(chr);

    if (length >= 2048)
        memset_repstos(target, fill, length);
    else if (length >= 32)
        memset_sse(target, fill, length);
    else
        memset_small(target, fill, length);

    return ptr;
}