From 888f2ea0a81ff171087bdd1c5c1eeda3b78d73d4 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 7 Apr 2023 11:47:07 -0400 Subject: [PATCH] Add array_sample() and array_shuffle() functions. These are useful in Monte Carlo applications. Martin Kalcher, reviewed/adjusted by Daniel Gustafsson and myself Discussion: https://postgr.es/m/9d160a44-7675-51e8-60cf-6d64b76db831@aboutsource.net --- doc/src/sgml/func.sgml | 44 ++++++- src/backend/utils/adt/array_userfuncs.c | 166 ++++++++++++++++++++++++ src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.dat | 6 + src/test/regress/expected/arrays.out | 54 ++++++++ src/test/regress/sql/arrays.sql | 14 ++ 6 files changed, 284 insertions(+), 2 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index dc44a74eb2..4211d31f30 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -16053,7 +16053,7 @@ SELECT js, js IS JSON ARRAY "array?" FROM (VALUES ('123'), ('"abc"'), ('{"a": "b"}'), ('[1,2]'),('abc')) foo(js); - js | json? | scalar? | object? | array? + js | json? | scalar? | object? | array? ------------+-------+---------+---------+-------- 123 | t | t | f | f "abc" | t | t | f | f @@ -18777,6 +18777,48 @@ SELECT NULLIF(value, '(none)') ... + + + + array_sample + + array_sample ( array anyarray, n integer ) + anyarray + + + Returns an array of n items randomly selected + from array. n may not + exceed the length of array's first dimension. + If array is multi-dimensional, + an item is a slice having a given first subscript. + + + array_sample(ARRAY[1,2,3,4,5,6], 3) + {2,6,1} + + + array_sample(ARRAY[[1,2],[3,4],[5,6]], 2) + {{5,6},{1,2}} + + + + + + + array_shuffle + + array_shuffle ( anyarray ) + anyarray + + + Randomly shuffles the first dimension of the array. 
+ + + array_shuffle(ARRAY[[1,2],[3,4],[5,6]]) + {{5,6},{1,2},{3,4}} + + + diff --git a/src/backend/utils/adt/array_userfuncs.c b/src/backend/utils/adt/array_userfuncs.c index 80750191d8..33e2b98307 100644 --- a/src/backend/utils/adt/array_userfuncs.c +++ b/src/backend/utils/adt/array_userfuncs.c @@ -15,6 +15,7 @@ #include "catalog/pg_type.h" #include "libpq/pqformat.h" #include "common/int.h" +#include "common/pg_prng.h" #include "port/pg_bitutils.h" #include "utils/array.h" #include "utils/datum.h" @@ -1525,3 +1526,168 @@ array_positions(PG_FUNCTION_ARGS) PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext)); } + +/* + * array_shuffle_n + * Return a copy of array with n randomly chosen items. + * + * The number of items must not exceed the size of the first dimension of the + * array. We preserve the first dimension's lower bound if keep_lb, + * else it's set to 1. Lower-order dimensions are preserved in any case. + * + * NOTE: it would be cleaner to look up the elmlen/elmbyval/elmalign info + * from the system catalogs, given only the elmtyp. However, the caller is + * in a better position to cache this info across multiple calls. 
+ */ +static ArrayType * +array_shuffle_n(ArrayType *array, int n, bool keep_lb, + Oid elmtyp, TypeCacheEntry *typentry) +{ + ArrayType *result; + int ndim, + *dims, + *lbs, + nelm, + nitem, + rdims[MAXDIM], + rlbs[MAXDIM]; + int16 elmlen; + bool elmbyval; + char elmalign; + Datum *elms, + *ielms; + bool *nuls, + *inuls; + + ndim = ARR_NDIM(array); + dims = ARR_DIMS(array); + lbs = ARR_LBOUND(array); + + elmlen = typentry->typlen; + elmbyval = typentry->typbyval; + elmalign = typentry->typalign; + + /* If the input array is empty or no items are requested, exit fast */ + if (ndim < 1 || dims[0] < 1 || n < 1) + return construct_empty_array(elmtyp); + + deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign, + &elms, &nuls, &nelm); + + nitem = dims[0]; /* total number of items (first-dimension slices) */ + nelm /= nitem; /* nelm is now the number of elements per item */ + + Assert(n <= nitem); /* else it's caller error */ + + /* + * Shuffle array using Fisher-Yates algorithm. Scan the array and swap + * current item (nelm datums starting at ielms) with a randomly chosen + * item in positions i .. nitem - 1 (nelm datums starting at jelms) in each + * iteration. We can stop after n iterations; the first n items are the result. 
+ */ + ielms = elms; + inuls = nuls; + for (int i = 0; i < n; i++) + { + int j = (int) pg_prng_uint64_range(&pg_global_prng_state, i, nitem - 1) * nelm; + Datum *jelms = elms + j; + bool *jnuls = nuls + j; + + /* Swap current and chosen items, values and null flags alike; this advances ielms/inuls to the next item */ + for (int k = 0; k < nelm; k++) + { + Datum elm = *ielms; + bool nul = *inuls; + + *ielms++ = *jelms; + *inuls++ = *jnuls; + *jelms++ = elm; + *jnuls++ = nul; + } + } + + /* Set up dimensions of the result: n items, first lower bound 1 unless keep_lb */ + memcpy(rdims, dims, ndim * sizeof(int)); + memcpy(rlbs, lbs, ndim * sizeof(int)); + rdims[0] = n; + if (!keep_lb) + rlbs[0] = 1; + + result = construct_md_array(elms, nuls, ndim, rdims, rlbs, + elmtyp, elmlen, elmbyval, elmalign); + + pfree(elms); + pfree(nuls); + + return result; +} + +/* + * array_shuffle + * + * Returns an array with the same dimensions and lower bounds as the input + * array, with its first-dimension items in random order. + */ +Datum +array_shuffle(PG_FUNCTION_ARGS) +{ + ArrayType *array = PG_GETARG_ARRAYTYPE_P(0); + ArrayType *result; + Oid elmtyp; + TypeCacheEntry *typentry; + + /* + * There is no point in shuffling empty arrays or arrays with fewer than + * two items, so return the input array as-is. + */ + if (ARR_NDIM(array) < 1 || ARR_DIMS(array)[0] < 2) + PG_RETURN_ARRAYTYPE_P(array); + + elmtyp = ARR_ELEMTYPE(array); + typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra; + if (typentry == NULL || typentry->type_id != elmtyp) + { + typentry = lookup_type_cache(elmtyp, 0); + fcinfo->flinfo->fn_extra = (void *) typentry; + } + + result = array_shuffle_n(array, ARR_DIMS(array)[0], true, elmtyp, typentry); + + PG_RETURN_ARRAYTYPE_P(result); +} + +/* + * array_sample + * + * Returns an array of n randomly chosen first-dimension items from the + * input array; raises an error unless n is between 0 and the item count. + */ +Datum +array_sample(PG_FUNCTION_ARGS) +{ + ArrayType *array = PG_GETARG_ARRAYTYPE_P(0); + int n = PG_GETARG_INT32(1); + ArrayType *result; + Oid elmtyp; + TypeCacheEntry *typentry; + int nitem; + + nitem = (ARR_NDIM(array) < 1) ? 
0 : ARR_DIMS(array)[0]; + + if (n < 0 || n > nitem) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("sample size must be between 0 and %d", nitem))); + + elmtyp = ARR_ELEMTYPE(array); + typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra; + if (typentry == NULL || typentry->type_id != elmtyp) + { + typentry = lookup_type_cache(elmtyp, 0); + fcinfo->flinfo->fn_extra = (void *) typentry; + } + + result = array_shuffle_n(array, n, false, elmtyp, typentry); + + PG_RETURN_ARRAYTYPE_P(result); +} diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index af134d2f67..42e881fafb 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202304051 +#define CATALOG_VERSION_NO 202304071 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index f9f2642201..f64bc68276 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -1717,6 +1717,12 @@ { oid => '6172', descr => 'remove last N elements of array', proname => 'trim_array', prorettype => 'anyarray', proargtypes => 'anyarray int4', prosrc => 'trim_array' }, +{ oid => '8464', descr => 'shuffle array', + proname => 'array_shuffle', provolatile => 'v', prorettype => 'anyarray', + proargtypes => 'anyarray', prosrc => 'array_shuffle' }, +{ oid => '8465', descr => 'take samples from array', + proname => 'array_sample', provolatile => 'v', prorettype => 'anyarray', + proargtypes => 'anyarray int4', prosrc => 'array_sample' }, { oid => '3816', descr => 'array typanalyze', proname => 'array_typanalyze', provolatile => 's', prorettype => 'bool', proargtypes => 'internal', prosrc => 'array_typanalyze' }, diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out index bfaf125187..7064391468 100644 --- a/src/test/regress/expected/arrays.out +++ b/src/test/regress/expected/arrays.out @@ -2472,3 +2472,57 
@@ SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail ERROR: number of elements to trim must be between 0 and 3 SELECT trim_array(ARRAY[]::int[], 1); -- fail ERROR: number of elements to trim must be between 0 and 0 +-- array_shuffle +SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) <@ '{1,2,3,4,5,6}'; + ?column? +---------- + t +(1 row) + +SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) @> '{1,2,3,4,5,6}'; + ?column? +---------- + t +(1 row) + +SELECT array_dims(array_shuffle('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[])); + array_dims +------------- + [-1:2][2:3] +(1 row) + +SELECT array_dims(array_shuffle('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[])); + array_dims +----------------- + [1:3][1:2][1:2] +(1 row) + +-- array_sample +SELECT array_sample('{1,2,3,4,5,6}'::int[], 3) <@ '{1,2,3,4,5,6}'; + ?column? +---------- + t +(1 row) + +SELECT array_length(array_sample('{1,2,3,4,5,6}'::int[], 3), 1); + array_length +-------------- + 3 +(1 row) + +SELECT array_dims(array_sample('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[], 3)); + array_dims +------------ + [1:3][2:3] +(1 row) + +SELECT array_dims(array_sample('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[], 2)); + array_dims +----------------- + [1:2][1:2][1:2] +(1 row) + +SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail +ERROR: sample size must be between 0 and 6 +SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail +ERROR: sample size must be between 0 and 6 diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql index 094937ba63..f1375621e0 100644 --- a/src/test/regress/sql/arrays.sql +++ b/src/test/regress/sql/arrays.sql @@ -761,3 +761,17 @@ FROM SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail SELECT trim_array(ARRAY[]::int[], 1); -- fail + +-- array_shuffle +SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) <@ '{1,2,3,4,5,6}'; +SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) @> '{1,2,3,4,5,6}'; +SELECT 
array_dims(array_shuffle('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[])); +SELECT array_dims(array_shuffle('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[])); + +-- array_sample +SELECT array_sample('{1,2,3,4,5,6}'::int[], 3) <@ '{1,2,3,4,5,6}'; +SELECT array_length(array_sample('{1,2,3,4,5,6}'::int[], 3), 1); +SELECT array_dims(array_sample('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[], 3)); +SELECT array_dims(array_sample('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[], 2)); +SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail +SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail