sqlite/ext/misc/percentile.c

/*
** 2013-05-28
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** This file contains code to implement the percentile(Y,P) SQL function
** and similar as described below:
**
**   (1)  The percentile(Y,P) function is an aggregate function taking
**        exactly two arguments.
**
**   (2)  If the P argument to percentile(Y,P) is not the same for every
**        row in the aggregate then an error is thrown.  The word "same"
**        in the previous sentence means that the value differ by less
**        than 0.001.
**
**   (3)  If the P argument to percentile(Y,P) evaluates to anything other
**        than a number in the range of 0.0 to 100.0 inclusive then an
**        error is thrown.
**
**   (4)  If any Y argument to percentile(Y,P) evaluates to a value that
**        is not NULL and is not numeric then an error is thrown.
**
**   (5)  If any Y argument to percentile(Y,P) evaluates to plus or minus
**        infinity then an error is thrown.  (SQLite always interprets NaN
**        values as NULL.)
**
**   (6)  Both Y and P in percentile(Y,P) can be arbitrary expressions,
**        including CASE WHEN expressions.
**
**   (7)  The percentile(Y,P) aggregate is able to handle inputs of at least
**        one million (1,000,000) rows.
**
**   (8)  If there are no non-NULL values for Y, then percentile(Y,P)
**        returns NULL.
**
**   (9)  If there is exactly one non-NULL value for Y, the percentile(Y,P)
**        returns the one Y value.
**
**  (10)  If there N non-NULL values of Y where N is two or more and
**        the Y values are ordered from least to greatest and a graph is
**        drawn from 0 to N-1 such that the height of the graph at J is
**        the J-th Y value and such that straight lines are drawn between
**        adjacent Y values, then the percentile(Y,P) function returns
**        the height of the graph at P*(N-1)/100.
**
**  (11)  The percentile(Y,P) function always returns either a floating
**        point number or NULL.
**
**  (12)  The percentile(Y,P) is implemented as a single C99 source-code
**        file that compiles into a shared-library or DLL that can be loaded
**        into SQLite using the sqlite3_load_extension() interface.
**
**  (13)  A separate median(Y) function is the equivalent percentile(Y,50).
**
**  (14)  A separate percentile_cont(Y,P) function is equivalent to
**        percentile(Y,P/100.0).  In other words, the fraction value in
**        the second argument is in the range of 0 to 1 instead of 0 to 100.
**
**  (15)  A separate percentile_disc(Y,P) function is like
**        percentile_cont(Y,P) except that instead of returning the weighted
**        average of the nearest two input values, it returns the next lower
**        value.  So the percentile_disc(Y,P) will always return a value
**        that was one of the inputs.
**
**  (16)  All of median(), percentile(Y,P), percentile_cont(Y,P) and
**        percentile_disc(Y,P) can be used as window functions.
**
** Differences from standard SQL:
**
**  *  The percentile_cont(X,P) function is equivalent to the following in
**     standard SQL:
**
**         (percentile_cont(P) WITHIN GROUP (ORDER BY X))
**
**     The SQLite syntax is much more compact.  The standard SQL syntax
**     is also supported if SQLite is compiled with the
**     -DSQLITE_ENABLE_ORDERED_SET_AGGREGATES option.
**
**  *  No median(X) function exists in the SQL standard.  App developers
**     are expected to write "percentile_cont(0.5)WITHIN GROUP(ORDER BY X)".
**
**  *  No percentile(Y,P) function exists in the SQL standard.  Instead of
**     percential(Y,P), developers must write this:
**     "percentile_cont(P/100.0) WITHIN GROUP (ORDER BY Y)".  Note that
**     the fraction parameter to percentile() goes from 0 to 100 whereas
**     the fraction parameter in SQL standard percentile_cont() goes from
**     0 to 1.
**
** Implementation notes as of 2024-08-31:
**
**  *  The regular aggregate-function versions of these routines work
**     by accumulating all values in an array of doubles, then sorting
**     that array using quicksort before computing the answer. Thus
**     the runtime is O(NlogN) where N is the number of rows of input.
**
**  *  For the window-function versions of these routines, the array of
**     inputs is sorted as soon as the first value is computed.  Thereafter,
**     the array is kept in sorted order using an insert-sort.  This
**     results in O(N*K) performance where K is the size of the window.
**     One can imagine alternative implementations that give O(N*logN*logK)
**     performance, but they require more complex logic and data structures.
**     The developers have elected to keep the asymptotically slower
**     algorithm for now, for simplicity, under the theory that window
**     functions are seldom used and when they are, the window size K is
**     often small.  The developers might revisit that decision later,
**     should the need arise.
*/
#if defined(SQLITE3_H)
  /* no-op */
#elif defined(SQLITE_STATIC_PERCENTILE)
#  include "sqlite3.h"
#else
#  include "sqlite3ext.h"
   SQLITE_EXTENSION_INIT1
#endif
#include <assert.h>
#include <string.h>
#include <stdlib.h>

/* The following object is the group context for a single percentile()
** aggregate.  Remember all input Y values until the very end.
** Those values are accumulated in the Percentile.a[] array.
*/
typedef struct Percentile Percentile;
struct Percentile {
  unsigned nAlloc;     /* Number of slots allocated for a[] */
  unsigned nUsed;      /* Number of slots actually used in a[] */
  char bSorted;        /* True if a[] is already in sorted order */
  char bKeepSorted;    /* True if advantageous to keep a[] sorted */
  char bPctValid;      /* True if rPct is valid */
  double rPct;         /* Fraction.  0.0 to 1.0 */
  double *a;           /* Array of Y values */
};

/* Details of each function in the percentile family */
typedef struct PercentileFunc PercentileFunc;
struct PercentileFunc {
  const char *zName;   /* Function name */
  char nArg;           /* Number of arguments */
  char mxFrac;         /* Maximum value of the "fraction" input */
  char bDiscrete;      /* True for percentile_disc() */
};
static const PercentileFunc aPercentFunc[] = {
  { "median",           1,   1, 0 },
  { "percentile",       2, 100, 0 },
  { "percentile_cont",  2,   1, 0 },
  { "percentile_disc",  2,   1, 1 },
};

/*
** Return TRUE if the input floating-point number is an infinity.
*/
static int percentIsInfinity(double r){
  sqlite3_uint64 u;
  assert( sizeof(u)==sizeof(r) );
  memcpy(&u, &r, sizeof(u));
  return ((u>>52)&0x7ff)==0x7ff;
}

/*
** Return TRUE if two doubles differ by 0.001 or less.
*/
static int percentSameValue(double a, double b){
  a -= b;
  return a>=-0.001 && a<=0.001;
}

/*
** Search p (which must have p->bSorted) looking for an entry with
** value y.  Return the index of that entry.
**
** If bExact is true, return -1 if the entry is not found.
**
** If bExact is false, return the index at which a new entry with
** value y should be insert in order to keep the values in sorted
** order.  The smallest return value in this case will be 0, and
** the largest return value will be p->nUsed.
*/
static int percentBinarySearch(Percentile *p, double y, int bExact){
  int iFirst = 0;              /* First element of search range */
  int iLast = p->nUsed - 1;    /* Last element of search range */
  while( iLast>=iFirst ){
    int iMid = (iFirst+iLast)/2;
    double x = p->a[iMid];
    if( x<y ){
      iFirst = iMid + 1;
    }else if( x>y ){
      iLast = iMid - 1;
    }else{
      return iMid;
    }
  }
  if( bExact ) return -1;
  return iFirst;
}

/*
** Generate an error for a percentile function.
**
** The error format string must have exactly one occurrance of "%%s()"
** (with two '%' characters).  That substring will be replaced by the name
** of the function.
*/
static void percentError(sqlite3_context *pCtx, const char *zFormat, ...){
  PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
  char *zMsg1;
  char *zMsg2;
  va_list ap;

  va_start(ap, zFormat);
  zMsg1 = sqlite3_vmprintf(zFormat, ap);
  va_end(ap);
  zMsg2 = zMsg1 ? sqlite3_mprintf(zMsg1, pFunc->zName) : 0;
  sqlite3_result_error(pCtx, zMsg2, -1);
  sqlite3_free(zMsg1);
  sqlite3_free(zMsg2);
}

/*
** The "step" function for percentile(Y,P) is called once for each
** input row.
*/
static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){
  Percentile *p;
  double rPct;
  int eType;
  double y;
  assert( argc==2 || argc==1 );

  if( argc==1 ){
    /* Requirement 13:  median(Y) is the same as percentile(Y,50). */
    rPct = 0.5;
  }else{
    /* Requirement 3:  P must be a number between 0 and 100 */
    PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
    eType = sqlite3_value_numeric_type(argv[1]);
    rPct = sqlite3_value_double(argv[1])/(double)pFunc->mxFrac;
    if( (eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT)
     || rPct<0.0 || rPct>1.0
    ){
      percentError(pCtx, "the fraction argument to %%s()"
                        " is not between 0.0 and %.1f",
                        (double)pFunc->mxFrac);
      return;
    }
  }

  /* Allocate the session context. */
  p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p));
  if( p==0 ) return;

  /* Remember the P value.  Throw an error if the P value is different
  ** from any prior row, per Requirement (2). */
  if( !p->bPctValid ){
    p->rPct = rPct;
    p->bPctValid = 1;
  }else if( !percentSameValue(p->rPct,rPct) ){
    percentError(pCtx, "the fraction argument to %%s()"
                      " is not the same for all input rows");
    return;
  }

  /* Ignore rows for which Y is NULL */
  eType = sqlite3_value_type(argv[0]);
  if( eType==SQLITE_NULL ) return;

  /* If not NULL, then Y must be numeric.  Otherwise throw an error.
  ** Requirement 4 */
  if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
    percentError(pCtx, "input to %%s() is not numeric");
    return;
  }

  /* Throw an error if the Y value is infinity or NaN */
  y = sqlite3_value_double(argv[0]);
  if( percentIsInfinity(y) ){
    percentError(pCtx, "Inf input to %%s()");
    return;
  }

  /* Allocate and store the Y */
  if( p->nUsed>=p->nAlloc ){
    unsigned n = p->nAlloc*2 + 250;
    double *a = sqlite3_realloc64(p->a, sizeof(double)*n);
    if( a==0 ){
      sqlite3_free(p->a);
      memset(p, 0, sizeof(*p));
      sqlite3_result_error_nomem(pCtx);
      return;
    }
    p->nAlloc = n;
    p->a = a;
  }
  if( p->nUsed==0 ){
    p->a[p->nUsed++] = y;
    p->bSorted = 1;
  }else if( !p->bSorted || y>=p->a[p->nUsed-1] ){
    p->a[p->nUsed++] = y;
  }else if( p->bKeepSorted ){
    int i;
    i = percentBinarySearch(p, y, 0);
    if( i<(int)p->nUsed ){
      memmove(&p->a[i+1], &p->a[i], (p->nUsed-i)*sizeof(p->a[0]));
    }
    p->a[i] = y;
    p->nUsed++;
  }else{
    p->a[p->nUsed++] = y;
    p->bSorted = 0;
  }
}

/*
** Interchange two doubles.
*/
#define SWAP_DOUBLE(X,Y)  {double ttt=(X);(X)=(Y);(Y)=ttt;}

/*
** Sort an array of doubles.
**
** Algorithm: quicksort
**
** This is implemented separately rather than using the qsort() routine
** from the standard library because:
**
**    (1)  To avoid a dependency on qsort()
**    (2)  To avoid the function call to the comparison routine for each
**         comparison.
*/
static void percentSort(double *a, unsigned int n){
  int iLt;  /* Entries before a[iLt] are less than rPivot */
  int iGt;  /* Entries at or after a[iGt] are greater than rPivot */
  int i;         /* Loop counter */
  double rPivot; /* The pivot value */

  assert( n>=2 );
  if( a[0]>a[n-1] ){
    SWAP_DOUBLE(a[0],a[n-1])
  }
  if( n==2 ) return;
  iGt = n-1;
  i = n/2;
  if( a[0]>a[i] ){
    SWAP_DOUBLE(a[0],a[i])
  }else if( a[i]>a[iGt] ){
    SWAP_DOUBLE(a[i],a[iGt])
  }
  if( n==3 ) return;
  rPivot = a[i];
  iLt = i = 1;
  do{
    if( a[i]<rPivot ){
      if( i>iLt ) SWAP_DOUBLE(a[i],a[iLt])
      iLt++;
      i++;
    }else if( a[i]>rPivot ){
      do{
        iGt--;
      }while( iGt>i && a[iGt]>rPivot );
      SWAP_DOUBLE(a[i],a[iGt])
    }else{
      i++;
    }
  }while( i<iGt );
  if( iLt>=2 ) percentSort(a, iLt);
  if( n-iGt>=2 ) percentSort(a+iGt, n-iGt);

/* Uncomment for testing */
#if 0
  for(i=0; i<n-1; i++){
    assert( a[i]<=a[i+1] );
  }
#endif
}


/*
** The "inverse" function for percentile(Y,P) is called to remove a
** row that was previously inserted by "step".
*/
static void percentInverse(sqlite3_context *pCtx,int argc,sqlite3_value **argv){
  Percentile *p;
  int eType;
  double y;
  int i;
  assert( argc==2 || argc==1 );

  /* Allocate the session context. */
  p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p));
  assert( p!=0 );

  /* Ignore rows for which Y is NULL */
  eType = sqlite3_value_type(argv[0]);
  if( eType==SQLITE_NULL ) return;

  /* If not NULL, then Y must be numeric.  Otherwise throw an error.
  ** Requirement 4 */
  if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
    return;
  }

  /* Ignore the Y value if it is infinity or NaN */
  y = sqlite3_value_double(argv[0]);
  if( percentIsInfinity(y) ){
    return;
  }
  if( p->bSorted==0 ){
    assert( p->nUsed>1 );
    percentSort(p->a, p->nUsed);
    p->bSorted = 1;
  }
  p->bKeepSorted = 1;

  /* Find and remove the row */
  i = percentBinarySearch(p, y, 1);
  if( i>=0 ){
    p->nUsed--;
    if( i<(int)p->nUsed ){
      memmove(&p->a[i], &p->a[i+1], (p->nUsed - i)*sizeof(p->a[0]));
    }
  }
}

/*
** Compute the final output of percentile().  Clean up all allocated
** memory if and only if bIsFinal is true.
*/
static void percentCompute(sqlite3_context *pCtx, int bIsFinal){
  Percentile *p;
  PercentileFunc *pFunc = (PercentileFunc*)sqlite3_user_data(pCtx);
  unsigned i1, i2;
  double v1, v2;
  double ix, vx;
  p = (Percentile*)sqlite3_aggregate_context(pCtx, 0);
  if( p==0 ) return;
  if( p->a==0 ) return;
  if( p->nUsed ){
    if( p->bSorted==0 ){
      assert( p->nUsed>1 );
      percentSort(p->a, p->nUsed);
      p->bSorted = 1;
    }
    ix = p->rPct*(p->nUsed-1);
    i1 = (unsigned)ix;
    if( pFunc->bDiscrete ){
      vx = p->a[i1];
    }else{
      i2 = ix==(double)i1 || i1==p->nUsed-1 ? i1 : i1+1;
      v1 = p->a[i1];
      v2 = p->a[i2];
      vx = v1 + (v2-v1)*(ix-i1);
    }
    sqlite3_result_double(pCtx, vx);
  }
  if( bIsFinal ){
    sqlite3_free(p->a);
    memset(p, 0, sizeof(*p));
  }else{
    p->bKeepSorted = 1;
  }
}
static void percentFinal(sqlite3_context *pCtx){
  percentCompute(pCtx, 1);
}
static void percentValue(sqlite3_context *pCtx){
  percentCompute(pCtx, 0);
}

#if defined(_WIN32) && !defined(SQLITE3_H) && !defined(SQLITE_STATIC_PERCENTILE)
__declspec(dllexport)
#endif
int sqlite3_percentile_init(
  sqlite3 *db,
  char **pzErrMsg,
  const sqlite3_api_routines *pApi
){
  int rc = SQLITE_OK;
  unsigned int i;
#if defined(SQLITE3_H) || defined(SQLITE_STATIC_PERCENTILE)
  (void)pApi;      /* Unused parameter */
#else
  SQLITE_EXTENSION_INIT2(pApi);
#endif
  (void)pzErrMsg;  /* Unused parameter */
  for(i=0; i<sizeof(aPercentFunc)/sizeof(aPercentFunc[0]); i++){
    rc = sqlite3_create_window_function(db,
            aPercentFunc[i].zName,
            aPercentFunc[i].nArg,
            SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_SELFORDER1,
            (void*)&aPercentFunc[i],
            percentStep, percentFinal, percentValue, percentInverse, 0);
    if( rc ) break;
  }
  return rc;
}