diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml index 50cf22ba6b..8eb4f538d5 100644 --- a/doc/src/sgml/ref/pgbench.sgml +++ b/doc/src/sgml/ref/pgbench.sgml @@ -1057,7 +1057,7 @@ pgbench options d default_seed - seed used in hash functions by default + seed used in hash and pseudorandom permutation functions by default @@ -1864,6 +1864,24 @@ SELECT 4 AS four \; SELECT 5 AS five \aset + + + permute ( i, size [, seed ] ) + integer + + + Permuted value of i, in the range + [0, size). This is the new position of + i (modulo size) in a + pseudorandom permutation of the integers 0...size-1, + parameterized by seed, see below. + + + permute(0, 4) + an integer between 0 and 3 + + + pi () @@ -2071,29 +2089,70 @@ f(x) = PHI(2.0 * parameter * (x - mu) / (max - min + 1)) / + + + When designing a benchmark which selects rows non-uniformly, be aware + that the rows chosen may be correlated with other data such as IDs from + a sequence or the physical row ordering, which may skew performance + measurements. + + + To avoid this, you may wish to use the permute + function, or some other additional step with similar effect, to shuffle + the selected rows and remove such correlations. + + + Hash functions hash, hash_murmur2 and hash_fnv1a accept an input value and an optional seed parameter. In case the seed isn't provided the value of :default_seed is used, which is initialized randomly unless set by the command-line - -D option. Hash functions can be used to scatter the - distribution of random functions such as random_zipfian or - random_exponential. For instance, the following pgbench - script simulates possible real world workload typical for social media and - blogging platforms where few accounts generate excessive load: + -D option. + + + + permute accepts an input value, a size, and an optional + seed parameter. It generates a pseudorandom permutation of integers in + the range [0, size), and returns the index of the input + value in the permuted values. The permutation chosen is parameterized by + the seed, which defaults to :default_seed, if not + specified. Unlike the hash functions, permute ensures + that there are no collisions or holes in the output values. Input values + outside the interval are interpreted modulo the size. The function raises + an error if the size is not positive. permute can be + used to scatter the distribution of non-uniform random functions such as + random_zipfian or random_exponential + so that values drawn more often are not trivially correlated. For + instance, the following pgbench script + simulates a possible real world workload typical for social media and + blogging platforms where a few accounts generate excessive load: -\set r random_zipfian(0, 100000000, 1.07) -\set k abs(hash(:r)) % 1000000 +\set size 1000000 +\set r random_zipfian(1, :size, 1.07) +\set k 1 + permute(:r, :size) In some cases several distinct distributions are needed which don't correlate - with each other and this is when implicit seed parameter comes in handy: + with each other and this is when the optional seed parameter comes in handy: -\set k1 abs(hash(:r, :default_seed + 123)) % 1000000 -\set k2 abs(hash(:r, :default_seed + 321)) % 1000000 +\set k1 1 + permute(:r, :size, :default_seed + 123) +\set k2 1 + permute(:r, :size, :default_seed + 321) + + A similar behavior can also be approximated with hash: + + +\set size 1000000 +\set r random_zipfian(1, 100 * :size, 1.07) +\set k 1 + abs(hash(:r)) % :size + + + However, since hash generates collisions, some values + will not be reachable and others will be more frequent than expected from + the original distribution. diff --git a/src/bin/pgbench/exprparse.y b/src/bin/pgbench/exprparse.y index 4d529ea550..56f75ccd25 100644 --- a/src/bin/pgbench/exprparse.y +++ b/src/bin/pgbench/exprparse.y @@ -19,6 +19,7 @@ #define PGBENCH_NARGS_VARIABLE (-1) #define PGBENCH_NARGS_CASE (-2) #define PGBENCH_NARGS_HASH (-3) +#define PGBENCH_NARGS_PERMUTE (-4) PgBenchExpr *expr_parse_result; @@ -370,6 +371,9 @@ static const struct { "hash_fnv1a", PGBENCH_NARGS_HASH, PGBENCH_HASH_FNV1A }, + { + "permute", PGBENCH_NARGS_PERMUTE, PGBENCH_PERMUTE + }, /* keep as last array element */ { NULL, 0, 0 @@ -482,6 +486,19 @@ make_func(yyscan_t yyscanner, int fnumber, PgBenchExprList *args) } break; + /* pseudorandom permutation function with optional seed argument */ + case PGBENCH_NARGS_PERMUTE: + if (len < 2 || len > 3) + expr_yyerror_more(yyscanner, "unexpected number of arguments", + PGBENCH_FUNCTIONS[fnumber].fname); + + if (len == 2) + { + PgBenchExpr *var = make_variable("default_seed"); + args = make_elist(var, args); + } + break; + /* common case: positive arguments number */ default: Assert(PGBENCH_FUNCTIONS[fnumber].nargs >= 0); diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 48ce1712cc..da1d9ec535 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -66,6 +66,7 @@ #include "getopt_long.h" #include "libpq-fe.h" #include "pgbench.h" +#include "port/pg_bitutils.h" #include "portability/instr_time.h" #ifndef M_PI @@ -1127,6 +1128,113 @@ getHashMurmur2(int64 val, uint64 seed) return (int64) result; } +/* + * Pseudorandom permutation function + * + * For small sizes, this generates each of the (size!) possible permutations + * of integers in the range [0, size) with roughly equal probability. Once + * the size is larger than 20, the number of possible permutations exceeds the + * number of distinct states of the internal pseudorandom number generators, + * and so not all possible permutations can be generated, but the permutations + * chosen should continue to give the appearance of being random. + * + * THIS FUNCTION IS NOT CRYPTOGRAPHICALLY SECURE. + * DO NOT USE FOR SUCH PURPOSE. + */ +static int64 +permute(const int64 val, const int64 isize, const int64 seed) +{ + RandomState random_state1; + RandomState random_state2; + uint64 size; + uint64 v; + int masklen; + uint64 mask; + int i; + + if (isize < 2) + return 0; /* nothing to permute */ + + /* Initialize a pair of random states using the seed */ + random_state1.xseed[0] = seed & 0xFFFF; + random_state1.xseed[1] = (seed >> 16) & 0xFFFF; + random_state1.xseed[2] = (seed >> 32) & 0xFFFF; + + random_state2.xseed[0] = (((uint64) seed) >> 48) & 0xFFFF; + random_state2.xseed[1] = seed & 0xFFFF; + random_state2.xseed[2] = (seed >> 16) & 0xFFFF; + + /* Computations are performed on unsigned values */ + size = (uint64) isize; + v = (uint64) val % size; + + /* Mask to work modulo largest power of 2 less than or equal to size */ + masklen = pg_leftmost_one_pos64(size); + mask = (((uint64) 1) << masklen) - 1; + + /* + * Permute the input value by applying several rounds of pseudorandom + * bijective transformations. The intention here is to distribute each + * input uniformly randomly across the range, and separate adjacent inputs + * approximately uniformly randomly from each other, leading to a fairly + * random overall choice of permutation. + * + * To separate adjacent inputs, we multiply by a random number modulo + * (mask + 1), which is a power of 2. For this to be a bijection, the + * multiplier must be odd. Since this is known to lead to less randomness + * in the lower bits, we also apply a rotation that shifts the topmost bit + * into the least significant bit. In the special cases where size <= 3, + * mask = 1 and each of these operations is actually a no-op, so we also + * XOR the value with a different random number to inject additional + * randomness. Since the size is generally not a power of 2, we apply + * this bijection on overlapping upper and lower halves of the input. + * + * To distribute the inputs uniformly across the range, we then also apply + * a random offset modulo the full range. + * + * Taken together, these operations resemble a modified linear + * congruential generator, as is commonly used in pseudorandom number + * generators. The number of rounds is fairly arbitrary, but six has been + * found empirically to give a fairly good tradeoff between performance + * and uniform randomness. For small sizes it selects each of the (size!) + * possible permutations with roughly equal probability. For larger + * sizes, not all permutations can be generated, but the intended random + * spread is still produced. + */ + for (i = 0; i < 6; i++) + { + uint64 m, + r, + t; + + /* Random multiply (by an odd number), XOR and rotate of lower half */ + m = (uint64) getrand(&random_state1, 0, mask) | 1; + r = (uint64) getrand(&random_state2, 0, mask); + if (v <= mask) + { + v = ((v * m) ^ r) & mask; + v = ((v << 1) & mask) | (v >> (masklen - 1)); + } + + /* Random multiply (by an odd number), XOR and rotate of upper half */ + m = (uint64) getrand(&random_state1, 0, mask) | 1; + r = (uint64) getrand(&random_state2, 0, mask); + t = size - 1 - v; + if (t <= mask) + { + t = ((t * m) ^ r) & mask; + t = ((t << 1) & mask) | (t >> (masklen - 1)); + v = size - 1 - t; + } + + /* Random offset */ + r = (uint64) getrand(&random_state2, 0, size - 1); + v = (v + r) % size; + } + + return (int64) v; +} + /* * Initialize the given SimpleStats struct to all zeroes */ @@ -2475,6 +2583,29 @@ evalStandardFunc(CState *st, return true; } + case PGBENCH_PERMUTE: + { + int64 val, + size, + seed; + + Assert(nargs == 3); + + if (!coerceToInt(&vargs[0], &val) || + !coerceToInt(&vargs[1], &size) || + !coerceToInt(&vargs[2], &seed)) + return false; + + if (size <= 0) + { + pg_log_error("permute size parameter must be greater than zero"); + return false; + } + + setIntValue(retval, permute(val, size, seed)); + return true; + } + default: /* cannot get here */ Assert(0); diff --git a/src/bin/pgbench/pgbench.h b/src/bin/pgbench/pgbench.h index 3a9d89e6f1..6ce1c98649 100644 --- a/src/bin/pgbench/pgbench.h +++ b/src/bin/pgbench/pgbench.h @@ -99,7 +99,8 @@ typedef enum PgBenchFunction PGBENCH_IS, PGBENCH_CASE, PGBENCH_HASH_FNV1A, - PGBENCH_HASH_MURMUR2 + PGBENCH_HASH_MURMUR2, + PGBENCH_PERMUTE } PgBenchFunction; typedef struct PgBenchExpr PgBenchExpr; diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl index 82a46c72b6..c2482dea17 100644 --- a/src/bin/pgbench/t/001_pgbench_with_server.pl +++ b/src/bin/pgbench/t/001_pgbench_with_server.pl @@ -4,6 +4,7 @@ use warnings; use PostgresNode; use TestLib; use Test::More; +use Config; # start a pgbench specific server my $node = get_new_node('main'); @@ -483,6 +484,17 @@ pgbench( qr{command=98.: int 5432\b}, # :random_seed qr{command=99.: int -9223372036854775808\b}, # min int qr{command=100.: int 9223372036854775807\b}, # max int + # pseudorandom permutation tests + qr{command=101.: boolean true\b}, + qr{command=102.: boolean true\b}, + qr{command=103.: boolean true\b}, + qr{command=104.: boolean true\b}, + qr{command=105.: boolean true\b}, + qr{command=109.: boolean true\b}, + qr{command=110.: boolean true\b}, + qr{command=111.: boolean true\b}, + qr{command=112.: int 9223372036854775797\b}, + qr{command=113.: boolean true\b}, ], 'pgbench expressions', { @@ -610,6 +622,33 @@ SELECT :v0, :v1, :v2, :v3; -- minint constant parsing \set min debug(-9223372036854775808) \set max debug(-(:min + 1)) +-- parametric pseudorandom permutation function +\set t debug(permute(0, 2) + permute(1, 2) = 1) +\set t debug(permute(0, 3) + permute(1, 3) + permute(2, 3) = 3) +\set t debug(permute(0, 4) + permute(1, 4) + permute(2, 4) + permute(3, 4) = 6) +\set t debug(permute(0, 5) + permute(1, 5) + permute(2, 5) + permute(3, 5) + permute(4, 5) = 10) +\set t debug(permute(0, 16) + permute(1, 16) + permute(2, 16) + permute(3, 16) + \ + permute(4, 16) + permute(5, 16) + permute(6, 16) + permute(7, 16) + \ + permute(8, 16) + permute(9, 16) + permute(10, 16) + permute(11, 16) + \ + permute(12, 16) + permute(13, 16) + permute(14, 16) + permute(15, 16) = 120) +-- random sanity checks +\set size random(2, 1000) +\set v random(0, :size - 1) +\set p permute(:v, :size) +\set t debug(0 <= :p and :p < :size and :p = permute(:v + :size, :size) and :p <> permute(:v + 1, :size)) +-- actual values +\set t debug(permute(:v, 1) = 0) +\set t debug(permute(0, 2, 5432) = 0 and permute(1, 2, 5432) = 1 and \ + permute(0, 2, 5435) = 1 and permute(1, 2, 5435) = 0) +-- 63 bits tests +\set size debug(:max - 10) +\set t debug(permute(:size-1, :size, 5432) = 5301702756001087507 and \ + permute(:size-2, :size, 5432) = 8968485976055840695 and \ + permute(:size-3, :size, 5432) = 6708495591295582115 and \ + permute(:size-4, :size, 5432) = 2801794404574855121 and \ + permute(:size-5, :size, 5432) = 1489011409218895840 and \ + permute(:size-6, :size, 5432) = 2267749475878240183 and \ + permute(:size-7, :size, 5432) = 1300324176838786780) } }); @@ -1048,6 +1087,10 @@ SELECT LEAST(} . join(', ', (':i') x 256) . q{)} 'bad boolean', 2, [qr{malformed variable.*trueXXX}], q{\set b :badtrue or true} ], + [ + 'invalid permute size', 2, + [qr{permute size parameter must be greater than zero}], q{\set i permute(0, 0)} + ], # GSET [ diff --git a/src/bin/pgbench/t/002_pgbench_no_server.pl b/src/bin/pgbench/t/002_pgbench_no_server.pl index e38c7d77d1..4027e68dfa 100644 --- a/src/bin/pgbench/t/002_pgbench_no_server.pl +++ b/src/bin/pgbench/t/002_pgbench_no_server.pl @@ -341,6 +341,16 @@ my @script_tests = ( 'set i', [ qr{set i 1 }, qr{\^ error found here} ], { 'set_i_op' => "\\set i 1 +\n" } + ], + [ + 'not enough arguments to permute', + [qr{unexpected number of arguments \(permute\)}], + { 'bad-permute-1.sql' => "\\set i permute(1)\n" } + ], + [ + 'too many arguments to permute', + [qr{unexpected number of arguments \(permute\)}], + { 'bad-permute-2.sql' => "\\set i permute(1, 2, 3, 4)\n" } ],); for my $t (@script_tests)