Avoid unnecessary sort operations when running one of the percentile

aggregates as a window function.

FossilOrigin-Name: 5d311536211eb1e3c887ceb7e6516d3900e6eebbccc8c445dd43cdd556182728
This commit is contained in:
drh 2024-08-31 16:55:14 +00:00
parent 129767958a
commit ad8ec9db63
3 changed files with 149 additions and 61 deletions

View File

@ -62,6 +62,29 @@
**
** (14) A separate percentile_cond(Y,X) function is the equivalent of
** percentile(Y,X*100.0).
**
** (15) All three SQL functions implemented by this module can also be
** used as window-functions.
**
** Implementation notes as of 2024-08-31:
**
** * The regular aggregate-function versions of the merge(), percentile(),
** and percentile_cond() routines work by accumulating all values in
** an array of doubles, then sorting that array using a quicksort
** before computing the answer. Thus the runtime is O(NlogN) where
** N is the number of rows of input.
**
** * For the window-function versions of these routines, the array of
** inputs is sorted as soon as the first value is computed. Thereafter,
** the array is kept in sorted order using an insert-sort. This
** results in O(N*K) performance where K is the size of the window.
** One can devise alternative implementations that give O(N*logN*logK)
** performance, but they require more complex logic and data structures.
** The developers have elected to keep the asymptotically slower
** algorithm for now, for simplicity, under the theory that window
** functions are seldom used and when they are, the window size K is
** often small. The developers might revisit that decision later,
** should the need arise.
*/
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT1
@ -78,6 +101,7 @@ struct Percentile {
unsigned nAlloc; /* Number of slots allocated for a[] */
unsigned nUsed; /* Number of slots actually used in a[] */
char bSorted; /* True if a[] is already in sorted order */
char bKeepSorted; /* True if advantageous to keep a[] sorted */
double rPct; /* 1.0 more than the value for P */
double *a; /* Array of Y values */
};
@ -85,7 +109,7 @@ struct Percentile {
/*
** Return TRUE if the input floating-point number is an infinity.
*/
static int isInfinity(double r){
static int percentIsInfinity(double r){
sqlite3_uint64 u;
assert( sizeof(u)==sizeof(r) );
memcpy(&u, &r, sizeof(u));
@ -93,13 +117,55 @@ static int isInfinity(double r){
}
/*
** Return TRUE if two doubles differ by 0.001 or less
** Return TRUE if two doubles differ by 0.001 or less.
*/
static int sameValue(double a, double b){
static int percentSameValue(double a, double b){
a -= b;
return a>=-0.001 && a<=0.001;
}
#if 0
/* Verify that the elements of the Percentile p are in fact sorted.
** Used for testing and debugging only.
*/
static void percentAssertSorted(Percentile *p){
int i;
for(i=p->nUsed-2; i>=0 && p->a[i]<=p->a[i+1]; i--){}
assert( i<0 );
}
#else
# define percentAssertSorted(X)
#endif
/*
** Search p (which must have p->bSorted) looking for an entry with
** value y. Return the index of that entry.
**
** If bExact is true, return -1 if the entry is not found.
**
** If bExact is false, return the index at which a new entry with
** value y should be insert in order to keep the values in sorted
** order. The smallest return value in this case will be 0, and
** the largest return value will be p->nUsed.
*/
static int percentBinarySearch(Percentile *p, double y, int bExact){
int iFirst = 0; /* First element of search range */
int iLast = p->nUsed - 1; /* Last element of search range */
while( iLast>=iFirst ){
int iMid = (iFirst+iLast)/2;
double x = p->a[iMid];
if( x<y ){
iFirst = iMid + 1;
}else if( x>y ){
iLast = iMid - 1;
}else{
return iMid;
}
}
if( bExact ) return -1;
return iFirst;
}
/*
** The "step" function for percentile(Y,P) is called once for each
** input row.
@ -145,7 +211,7 @@ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){
** from any prior row, per Requirement (2). */
if( p->rPct==0.0 ){
p->rPct = rPct+1.0;
}else if( !sameValue(p->rPct,rPct+1.0) ){
}else if( !percentSameValue(p->rPct,rPct+1.0) ){
sqlite3_result_error(pCtx, "2nd argument to percentile() is not the "
"same for all input rows", -1);
return;
@ -165,7 +231,7 @@ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){
/* Throw an error if the Y value is infinity or NaN */
y = sqlite3_value_double(argv[0]);
if( isInfinity(y) ){
if( percentIsInfinity(y) ){
sqlite3_result_error(pCtx, "Inf input to percentile()", -1);
return;
}
@ -183,55 +249,26 @@ static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){
p->nAlloc = n;
p->a = a;
}
p->a[p->nUsed++] = y;
assert( p->nUsed>=1 );
if( p->nUsed==1 ){
if( p->nUsed==0 ){
p->a[p->nUsed++] = y;
p->bSorted = 1;
}else if( p->bSorted && p->a[p->nUsed-2]>y ){
}else if( !p->bSorted || y>=p->a[p->nUsed-1] ){
p->a[p->nUsed++] = y;
}else if( p->bKeepSorted ){
int i;
percentAssertSorted(p);
i = percentBinarySearch(p, y, 0);
if( i<p->nUsed ){
memmove(&p->a[i+1], &p->a[i], (p->nUsed-i)*sizeof(p->a[0]));
}
p->a[i] = y;
p->nUsed++;
}else{
p->a[p->nUsed++] = y;
p->bSorted = 0;
}
}
/*
** The "inverse" function for percentile(Y,P) is called to remove a
** row that was previously inserted by "step".
*/
static void percentInverse(sqlite3_context *pCtx,int argc,sqlite3_value **argv){
Percentile *p;
int eType;
double y;
int i;
assert( argc==2 || argc==1 );
/* Allocate the session context. */
p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p));
assert( p!=0 );
/* Ignore rows for which Y is NULL */
eType = sqlite3_value_type(argv[0]);
if( eType==SQLITE_NULL ) return;
/* If not NULL, then Y must be numeric. Otherwise throw an error.
** Requirement 4 */
if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
return;
}
/* Ignore the Y value if it is infinity or NaN */
y = sqlite3_value_double(argv[0]);
if( isInfinity(y) ){
return;
}
/* Find and remove the row */
for(i=0; i<p->nUsed && p->a[i]!=y; i++){}
if( i<p->nUsed ){
p->a[i] = p->a[p->nUsed-1];
p->nUsed--;
}
p->bSorted = p->nUsed<=1;
}
/*
** Sort an array of doubles.
**
@ -290,9 +327,59 @@ static void sortDoubles(double *a, int n){
#endif
}
/*
** Called to compute the final output of percentile() and to clean
** up all allocated memory.
** The "inverse" function for percentile(Y,P) is called to remove a
** row that was previously inserted by "step".
*/
static void percentInverse(sqlite3_context *pCtx,int argc,sqlite3_value **argv){
Percentile *p;
int eType;
double y;
int i;
assert( argc==2 || argc==1 );
/* Allocate the session context. */
p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p));
assert( p!=0 );
/* Ignore rows for which Y is NULL */
eType = sqlite3_value_type(argv[0]);
if( eType==SQLITE_NULL ) return;
/* If not NULL, then Y must be numeric. Otherwise throw an error.
** Requirement 4 */
if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
return;
}
/* Ignore the Y value if it is infinity or NaN */
y = sqlite3_value_double(argv[0]);
if( percentIsInfinity(y) ){
return;
}
if( p->bSorted==0 ){
sortDoubles(p->a, p->nUsed);
p->bSorted = 1;
}else{
percentAssertSorted(p);
}
p->bKeepSorted = 1;
/* Find and remove the row */
i = percentBinarySearch(p, y, 1);
if( i>=0 ){
p->nUsed--;
if( i<p->nUsed ){
memmove(&p->a[i], &p->a[i+1], (p->nUsed - i)*sizeof(p->a[0]));
}
}
percentAssertSorted(p);
}
/*
** Compute the final output of percentile(). Clean up all allocated
** memory if and only if bIsFinal is true.
*/
static void percentCompute(sqlite3_context *pCtx, int bIsFinal){
Percentile *p;
@ -306,6 +393,8 @@ static void percentCompute(sqlite3_context *pCtx, int bIsFinal){
if( p->bSorted==0 ){
sortDoubles(p->a, p->nUsed);
p->bSorted = 1;
}else{
percentAssertSorted(p);
}
ix = (p->rPct-1.0)*(p->nUsed-1)*0.01;
i1 = (unsigned)ix;
@ -318,6 +407,8 @@ static void percentCompute(sqlite3_context *pCtx, int bIsFinal){
if( bIsFinal ){
sqlite3_free(p->a);
memset(p, 0, sizeof(*p));
}else{
p->bKeepSorted = 1;
}
}
static void percentFinal(sqlite3_context *pCtx){
@ -327,8 +418,6 @@ static void percentValue(sqlite3_context *pCtx){
percentCompute(pCtx, 0);
}
#ifdef _WIN32
__declspec(dllexport)
#endif
@ -349,7 +438,6 @@ int sqlite3_percentile_init(
SQLITE_UTF8|SQLITE_INNOCUOUS, 0,
percentStep, percentFinal,
percentValue, percentInverse, 0);
}
if( rc==SQLITE_OK ){
rc = sqlite3_create_window_function(db, "percentile_cont", 2,

View File

@ -1,5 +1,5 @@
C Test\scases\sadded.
D 2024-08-31T15:02:07.805
C Avoid\sunnecessary\ssort\soperations\swhen\srunning\sone\sof\sthe\spercentile\naggregates\sas\sa\swindow\sfunction.
D 2024-08-31T16:55:14.747
F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1
F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea
F LICENSE.md df5091916dbb40e6e9686186587125e1b2ff51f022cc334e886c19a0e9982724
@ -410,7 +410,7 @@ F ext/misc/nextchar.c 7877914c2a80c2f181dd04c3dbef550dfb54c93495dc03da2403b5dd58
F ext/misc/noop.c f1a21cc9b7a4e667e5c8458d80ba680b8bd4315a003f256006046879f679c5a0
F ext/misc/normalize.c bd84355c118e297522aba74de34a4fd286fc775524e0499b14473918d09ea61f
F ext/misc/pcachetrace.c f4227ce03fb16aa8d6f321b72dd051097419d7a028a9853af048bee7645cb405
F ext/misc/percentile.c 89416b108569171be1d8dda4fa2687ad116ea969b4d129d02cf3dc1fd67fc87e
F ext/misc/percentile.c 46627b7495c69344d384f667bb6c80ba2c4aeb779997a4e22fea1a39cd20beb9
F ext/misc/prefixes.c 82645f79229877afab08c8b08ca1e7fa31921280906b90a61c294e4f540cd2a6
F ext/misc/qpvtab.c fc189e127f68f791af90a487f4460ec91539a716daf45a0c357e963fd47cc06c
F ext/misc/randomjson.c ef835fc64289e76ac4873b85fe12f9463a036168d7683cf2b773e36e6262c4ed
@ -2211,8 +2211,8 @@ F vsixtest/vsixtest.tcl 6195aba1f12a5e10efc2b8c0009532167be5e301abe5b31385638080
F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc
F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e
F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0
P 4d0e3df4b9c609755977b8a462126242d2be1310c0122a8d4ba76d98d32a7230
R 06b03a21c6b126e146ea802b7b43139b
P 25e68229843cc84978955817285550085d1306ba4ce3b0517dd00e5d05b9ae0a
R 1b58d4d1990506055e066a3b56b6ff9a
U drh
Z a8a75faed57aa0d8766eb4f3377323de
Z 10ad6a710202821993e7562cc60a54b3
# Remove this line to create a well-formed Fossil manifest.

View File

@ -1 +1 @@
25e68229843cc84978955817285550085d1306ba4ce3b0517dd00e5d05b9ae0a
5d311536211eb1e3c887ceb7e6516d3900e6eebbccc8c445dd43cdd556182728