sqlite/ext/misc/percentile.c
drh 171c944345 Enhance the percentile() extension function to include the median()
variant.  Update the implementation to implement its own sorting
algorithm, so that the extension no longer depends on qsort().

FossilOrigin-Name: 6e31b1bab1f014933c671a12a5930c1e88d611cfe68d389e9ce888bd8842addd
2024-07-23 16:23:46 +00:00

271 lines
7.9 KiB
C

/*
** 2013-05-28
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
**
** This file contains code to implement the percentile(Y,P) SQL function
** as described below:
**
** (1) The percentile(Y,P) function is an aggregate function taking
** exactly two arguments.
**
** (2) If the P argument to percentile(Y,P) is not the same for every
** row in the aggregate then an error is thrown. The word "same"
** in the previous sentence means that the value differ by less
** than 0.001.
**
** (3) If the P argument to percentile(Y,P) evaluates to anything other
** than a number in the range of 0.0 to 100.0 inclusive then an
** error is thrown.
**
** (4) If any Y argument to percentile(Y,P) evaluates to a value that
** is not NULL and is not numeric then an error is thrown.
**
** (5) If any Y argument to percentile(Y,P) evaluates to plus or minus
** infinity then an error is thrown. (SQLite always interprets NaN
** values as NULL.)
**
** (6) Both Y and P in percentile(Y,P) can be arbitrary expressions,
** including CASE WHEN expressions.
**
** (7) The percentile(Y,P) aggregate is able to handle inputs of at least
** one million (1,000,000) rows.
**
** (8) If there are no non-NULL values for Y, then percentile(Y,P)
** returns NULL.
**
** (9) If there is exactly one non-NULL value for Y, the percentile(Y,P)
** returns the one Y value.
**
** (10) If there N non-NULL values of Y where N is two or more and
** the Y values are ordered from least to greatest and a graph is
** drawn from 0 to N-1 such that the height of the graph at J is
** the J-th Y value and such that straight lines are drawn between
** adjacent Y values, then the percentile(Y,P) function returns
** the height of the graph at P*(N-1)/100.
**
** (11) The percentile(Y,P) function always returns either a floating
** point number or NULL.
**
** (12) The percentile(Y,P) is implemented as a single C99 source-code
** file that compiles into a shared-library or DLL that can be loaded
** into SQLite using the sqlite3_load_extension() interface.
**
** (13) A separate median(Y) function is the equivalent percentile(Y,50).
*/
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT1
#include <assert.h>
#include <string.h>
#include <stdlib.h>
/* The following object is the session context for a single percentile()
** function. We have to remember all input Y values until the very end.
** Those values are accumulated in the Percentile.a[] array.
*/
typedef struct Percentile Percentile;
struct Percentile {
unsigned nAlloc; /* Number of slots allocated for a[] */
unsigned nUsed; /* Number of slots actually used in a[] */
double rPct; /* 1.0 more than the value for P */
double *a; /* Array of Y values */
};
/*
** Return TRUE if the input floating-point number is an infinity.
*/
static int isInfinity(double r){
sqlite3_uint64 u;
assert( sizeof(u)==sizeof(r) );
memcpy(&u, &r, sizeof(u));
return ((u>>52)&0x7ff)==0x7ff;
}
/*
** Return TRUE if two doubles differ by 0.001 or less
*/
static int sameValue(double a, double b){
a -= b;
return a>=-0.001 && a<=0.001;
}
/*
** The "step" function for percentile(Y,P) is called once for each
** input row.
*/
static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){
Percentile *p;
double rPct;
int eType;
double y;
assert( argc==2 || argc==1 );
if( argc==1 ){
/* Requirement 13: median(Y) is the same as percentile(Y,50). */
rPct = 50.0;
}else{
/* Requirement 3: P must be a number between 0 and 100 */
eType = sqlite3_value_numeric_type(argv[1]);
rPct = sqlite3_value_double(argv[1]);
if( (eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT)
|| rPct<0.0 || rPct>100.0 ){
sqlite3_result_error(pCtx, "2nd argument to percentile() is not "
"a number between 0.0 and 100.0", -1);
return;
}
}
/* Allocate the session context. */
p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p));
if( p==0 ) return;
/* Remember the P value. Throw an error if the P value is different
** from any prior row, per Requirement (2). */
if( p->rPct==0.0 ){
p->rPct = rPct+1.0;
}else if( !sameValue(p->rPct,rPct+1.0) ){
sqlite3_result_error(pCtx, "2nd argument to percentile() is not the "
"same for all input rows", -1);
return;
}
/* Ignore rows for which Y is NULL */
eType = sqlite3_value_type(argv[0]);
if( eType==SQLITE_NULL ) return;
/* If not NULL, then Y must be numeric. Otherwise throw an error.
** Requirement 4 */
if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
sqlite3_result_error(pCtx, "1st argument to percentile() is not "
"numeric", -1);
return;
}
/* Throw an error if the Y value is infinity or NaN */
y = sqlite3_value_double(argv[0]);
if( isInfinity(y) ){
sqlite3_result_error(pCtx, "Inf input to percentile()", -1);
return;
}
/* Allocate and store the Y */
if( p->nUsed>=p->nAlloc ){
unsigned n = p->nAlloc*2 + 250;
double *a = sqlite3_realloc64(p->a, sizeof(double)*n);
if( a==0 ){
sqlite3_free(p->a);
memset(p, 0, sizeof(*p));
sqlite3_result_error_nomem(pCtx);
return;
}
p->nAlloc = n;
p->a = a;
}
p->a[p->nUsed++] = y;
}
/*
** Sort an array of doubles.
*/
static void sortDoubles(double *a, int n){
int iLt; /* Entries with index less than iLt are less than rPivot */
int iGt; /* Entries with index iGt or more are greater than rPivot */
int i; /* Loop counter */
double rPivot; /* The pivot value */
double rTmp; /* Temporary used to swap two values */
if( n<2 ) return;
if( n>5 ){
rPivot = (a[0] + a[n/2] + a[n-1])/3.0;
}else{
rPivot = a[n/2];
}
iLt = i = 0;
iGt = n;
while( i<iGt ){
if( a[i]<rPivot ){
if( i>iLt ){
rTmp = a[i];
a[i] = a[iLt];
a[iLt] = rTmp;
}
iLt++;
i++;
}else if( a[i]>rPivot ){
do{
iGt--;
}while( iGt>i && a[iGt]>rPivot );
rTmp = a[i];
a[i] = a[iGt];
a[iGt] = rTmp;
}else{
i++;
}
}
if( iLt>=2 ) sortDoubles(a, iLt);
if( n-iGt>=2 ) sortDoubles(a+iGt, n-iGt);
/* Uncomment for testing */
#if 0
for(i=0; i<n-1; i++){
assert( a[i]<=a[i+1] );
}
#endif
}
/*
** Called to compute the final output of percentile() and to clean
** up all allocated memory.
*/
static void percentFinal(sqlite3_context *pCtx){
Percentile *p;
unsigned i1, i2;
double v1, v2;
double ix, vx;
p = (Percentile*)sqlite3_aggregate_context(pCtx, 0);
if( p==0 ) return;
if( p->a==0 ) return;
if( p->nUsed ){
sortDoubles(p->a, p->nUsed);
ix = (p->rPct-1.0)*(p->nUsed-1)*0.01;
i1 = (unsigned)ix;
i2 = ix==(double)i1 || i1==p->nUsed-1 ? i1 : i1+1;
v1 = p->a[i1];
v2 = p->a[i2];
vx = v1 + (v2-v1)*(ix-i1);
sqlite3_result_double(pCtx, vx);
}
sqlite3_free(p->a);
memset(p, 0, sizeof(*p));
}
#ifdef _WIN32
__declspec(dllexport)
#endif
int sqlite3_percentile_init(
sqlite3 *db,
char **pzErrMsg,
const sqlite3_api_routines *pApi
){
int rc = SQLITE_OK;
SQLITE_EXTENSION_INIT2(pApi);
(void)pzErrMsg; /* Unused parameter */
rc = sqlite3_create_function(db, "percentile", 2,
SQLITE_UTF8|SQLITE_INNOCUOUS, 0,
0, percentStep, percentFinal);
if( rc==SQLITE_OK ){
rc = sqlite3_create_function(db, "median", 1,
SQLITE_UTF8|SQLITE_INNOCUOUS, 0,
0, percentStep, percentFinal);
}
return rc;
}