From 0ce38730ac72029f3f2c95ae80b44f5b9060cbcc Mon Sep 17 00:00:00 2001
From: Tom Lane
Date: Sat, 28 Dec 2019 17:21:17 -0500
Subject: [PATCH] Micro-optimize AllocSetFreeIndex() by reference to
 pg_bitutils code.

Use __builtin_clz() where available.  Where it isn't, we can still win
a little by using the pg_leftmost_one_pos[] lookup table instead of
having a private table.

Also drop the initial right shift by ALLOC_MINBITS in favor of
subtracting ALLOC_MINBITS from the leftmost-one-pos result.  This is a
win because the compiler can fold that adjustment into other constants
it'd have to add anyway, making the shift-removal free.

Also, we can explain this coding as an unrolled form of
pg_leftmost_one_pos32(), even though that's a bit ahistorical since it
long predates pg_bitutils.h.

John Naylor, with some cosmetic adjustments by me

Discussion: https://postgr.es/m/CACPNZCuNUGMxjK7WTn_=WZnRbfASDdBxmjsVf2+m9MdmeNw_sg@mail.gmail.com
---
 src/backend/utils/mmgr/aset.c | 56 +++++++++++++++++++----------------
 1 file changed, 31 insertions(+), 25 deletions(-)

diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c
index f729d9b6de..5ac3811212 100644
--- a/src/backend/utils/mmgr/aset.c
+++ b/src/backend/utils/mmgr/aset.c
@@ -46,6 +46,7 @@
 
 #include "postgres.h"
 
+#include "port/pg_bitutils.h"
 #include "utils/memdebug.h"
 #include "utils/memutils.h"
 
@@ -297,18 +298,6 @@ static const MemoryContextMethods AllocSetMethods = {
 #endif
 };
 
-/*
- * Table for AllocSetFreeIndex
- */
-#define LT16(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
-
-static const unsigned char LogTable256[256] =
-{
-    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
-    LT16(5), LT16(6), LT16(6), LT16(7), LT16(7), LT16(7), LT16(7),
-    LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8)
-};
-
 /* ----------
  * Debug macros
  * ----------
@@ -337,24 +326,41 @@ static inline int
 AllocSetFreeIndex(Size size)
 {
     int         idx;
-    unsigned int t,
-                tsize;
 
     if (size > (1 << ALLOC_MINBITS))
     {
-        tsize = (size - 1) >> ALLOC_MINBITS;
-
-        /*
-         * At this point we need to obtain log2(tsize)+1, ie, the number of
-         * not-all-zero bits at the right.  We used to do this with a
-         * shift-and-count loop, but this function is enough of a hotspot to
-         * justify micro-optimization effort.  The best approach seems to be
-         * to use a lookup table.  Note that this code assumes that
-         * ALLOCSET_NUM_FREELISTS <= 17, since we only cope with two bytes of
-         * the tsize value.
+        /*----------
+         * At this point we must compute ceil(log2(size >> ALLOC_MINBITS)).
+         * This is the same as
+         *      pg_leftmost_one_pos32((size - 1) >> ALLOC_MINBITS) + 1
+         * or equivalently
+         *      pg_leftmost_one_pos32(size - 1) - ALLOC_MINBITS + 1
+         *
+         * However, rather than just calling that function, we duplicate the
+         * logic here, allowing an additional optimization.  It's reasonable
+         * to assume that ALLOC_CHUNK_LIMIT fits in 16 bits, so we can unroll
+         * the byte-at-a-time loop in pg_leftmost_one_pos32 and just handle
+         * the last two bytes.
+         *
+         * Yes, this function is enough of a hot-spot to make it worth this
+         * much trouble.
+         *----------
          */
+#ifdef HAVE__BUILTIN_CLZ
+        idx = 31 - __builtin_clz((uint32) size - 1) - ALLOC_MINBITS + 1;
+#else
+        uint32      t,
+                    tsize;
+
+        /* Statically assert that we only have a 16-bit input value. */
+        StaticAssertStmt(ALLOC_CHUNK_LIMIT < (1 << 16),
+                         "ALLOC_CHUNK_LIMIT must be less than 64kB");
+
+        tsize = size - 1;
         t = tsize >> 8;
-        idx = t ? LogTable256[t] + 8 : LogTable256[tsize];
+        idx = t ? pg_leftmost_one_pos[t] + 8 : pg_leftmost_one_pos[tsize];
+        idx -= ALLOC_MINBITS - 1;
+#endif
 
         Assert(idx < ALLOCSET_NUM_FREELISTS);
     }
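
Not part of the patch: for reviewers who want to convince themselves of the
equivalence the commit message relies on, the standalone sketch below checks
that the clz-based expression yields the same freelist index as the historical
shift-and-count semantics for every admissible request size.  The constants
(ALLOC_MINBITS = 3, ALLOCSET_NUM_FREELISTS = 11, and ALLOC_CHUNK_LIMIT derived
from them) are assumed here for illustration rather than taken from
memutils.h, and __builtin_clz() is assumed to be available (gcc/clang).

/* Equivalence-check sketch; assumed constants, not the real headers. */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ALLOC_MINBITS           3   /* assumed: smallest chunk is 8 bytes */
#define ALLOCSET_NUM_FREELISTS  11  /* assumed */
#define ALLOC_CHUNK_LIMIT   (1 << (ALLOCSET_NUM_FREELISTS - 1 + ALLOC_MINBITS))

/* Reference semantics: the old shift-and-count loop over (size-1) >> MINBITS. */
static int
free_index_loop(size_t size)
{
    int         idx = 0;

    if (size > (1 << ALLOC_MINBITS))
    {
        uint32_t    tsize = (size - 1) >> ALLOC_MINBITS;

        while (tsize != 0)
        {
            idx++;
            tsize >>= 1;
        }
    }
    return idx;
}

/* Patched form: leftmost-one position, with the right shift folded into
 * the "- ALLOC_MINBITS" adjustment, as the commit message describes. */
static int
free_index_clz(size_t size)
{
    int         idx = 0;

    if (size > (1 << ALLOC_MINBITS))
        idx = 31 - __builtin_clz((uint32_t) size - 1) - ALLOC_MINBITS + 1;
    return idx;
}

int
main(void)
{
    for (size_t size = 1; size <= ALLOC_CHUNK_LIMIT; size++)
    {
        assert(free_index_loop(size) == free_index_clz(size));
        assert(free_index_clz(size) < ALLOCSET_NUM_FREELISTS);
    }
    printf("indexes agree for all sizes up to %d\n", ALLOC_CHUNK_LIMIT);
    return 0;
}

Built with something like "gcc -O2 check.c && ./a.out", this exercises every
size up to the assumed chunk limit; it is only a sanity check, not a
substitute for the regression tests.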