First code for an auxiliary function that will normalize an SQL statement.

FossilOrigin-Name: 84814aac81e54f03430f180926156ab0fc01e22bbce2bb228b698ea0d4a588ea
2018-01-08 16:54:26 +00:00 · 2018-01-08 16:54:26 +00:00 · a41f2855e9
commit a41f2855e9
parent a5da4ef4ad
3 changed files with 649 additions and 6 deletions
--- a/ext/misc/normalize.c
+++ b/ext/misc/normalize.c
@ -0,0 +1,639 @@
+/*
+** 2018-01-08
+**
+** The author disclaims copyright to this source code.  In place of
+** a legal notice, here is a blessing:
+**
+**    May you do good and not evil.
+**    May you find forgiveness for yourself and forgive others.
+**    May you share freely, never taking more than you give.
+**
+******************************************************************************
+**
+** This file contains code to implement the sqlite3_normalize() function.
+**
+**    char *sqlite3_normalize(const char *zSql);
+**
+** This function takes an SQL string as input and returns a "normalized"
+** version of that string in memory obtained from sqlite3_malloc64().  The
+** caller is responsible for ensuring that the returned memory is freed.
+**
+** If a memory allocation error occurs, this routine returns NULL.
+**
+** The normalization consists of the following transformations:
+**
+**   (1)  Convert every literal (string, blob literal, numeric constant,
+**        or "NULL" constant) into a ?
+**
+**   (2)  Remove all superfluous whitespace, including comments.  Change
+**        all required whitespace to a single space character.
+**
+**   (3)  Lowercase all ASCII characters.
+**
+**   (4)  If an IN or NOT IN operator is followed by a list of 1 or more
+**        values, convert that list into "(?,?,?)".
+**
+** The purpose of normalization is two-fold:
+**
+**   (1)  Sanitize queries by removing possibly sensitive information contained
+**        in literals.
+**
+**   (2)  Identify structurally identical queries by comparing their
+**        normalized forms.
+*/
+#include <sqlite3.h>
+#include <string.h>
+
+/*
+** Implementation note:
+**
+** Much of the tokenizer logic is copied out of the tokenize.c source file
+** of SQLite.  This logic could be simplified for this particular application,
+** but that would impose a risk of introducing subtle errors.  It is best to
+** keep the code as close to the original as possible.
+*/
+
+
+/* Character classes for tokenizing
+**
+** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented
+** using a lookup table, whereas a switch() directly on c uses a binary search.
+** The lookup table is much faster.  To maximize speed, and to ensure that
+** a lookup table is used, all of the classes need to be small integers and
+** all of them need to be used within the switch.
+*/
+#define CC_X          0    /* The letter 'x', or start of BLOB literal */
+#define CC_KYWD       1    /* Alphabetics or '_'.  Usable in a keyword */
+#define CC_ID         2    /* unicode characters usable in IDs */
+#define CC_DIGIT      3    /* Digits */
+#define CC_DOLLAR     4    /* '$' */
+#define CC_VARALPHA   5    /* '@', '#', ':'.  Alphabetic SQL variables */
+#define CC_VARNUM     6    /* '?'.  Numeric SQL variables */
+#define CC_SPACE      7    /* Space characters */
+#define CC_QUOTE      8    /* '"', '\'', or '`'.  String literals, quoted ids */
+#define CC_QUOTE2     9    /* '['.   [...] style quoted ids */
+#define CC_PIPE      10    /* '|'.   Bitwise OR or concatenate */
+#define CC_MINUS     11    /* '-'.  Minus or SQL-style comment */
+#define CC_LT        12    /* '<'.  Part of < or <= or <> */
+#define CC_GT        13    /* '>'.  Part of > or >= */
+#define CC_EQ        14    /* '='.  Part of = or == */
+#define CC_BANG      15    /* '!'.  Part of != */
+#define CC_SLASH     16    /* '/'.  / or c-style comment */
+#define CC_LP        17    /* '(' */
+#define CC_RP        18    /* ')' */
+#define CC_SEMI      19    /* ';' */
+#define CC_PLUS      20    /* '+' */
+#define CC_STAR      21    /* '*' */
+#define CC_PERCENT   22    /* '%' */
+#define CC_COMMA     23    /* ',' */
+#define CC_AND       24    /* '&' */
+#define CC_TILDA     25    /* '~' */
+#define CC_DOT       26    /* '.' */
+#define CC_ILLEGAL   27    /* Illegal character */
+
+static const unsigned char aiClass[] = {
+/*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf */
+/* 0x */   27, 27, 27, 27, 27, 27, 27, 27, 27,  7,  7, 27,  7,  7, 27, 27,
+/* 1x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+/* 2x */    7, 15,  8,  5,  4, 22, 24,  8, 17, 18, 21, 20, 23, 11, 26, 16,
+/* 3x */    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  5, 19, 12, 14, 13,  6,
+/* 4x */    5,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+/* 5x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  9, 27, 27, 27,  1,
+/* 6x */    8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+/* 7x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1, 27, 10, 27, 25, 27,
+/* 8x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+/* 9x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+/* Ax */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+/* Bx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+/* Cx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+/* Dx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+/* Ex */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+/* Fx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
+};
+
+/* An array to map all upper-case characters into their corresponding
+** lower-case character. 
+**
+** SQLite only considers US-ASCII (or EBCDIC) characters.  We do not
+** handle case conversions for the UTF character set since the tables
+** involved are nearly as big or bigger than SQLite itself.
+*/
+const unsigned char sqlite3UpperToLower[] = {
+      0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
+     18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+     36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
+     54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
+    104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
+    122, 91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107,
+    108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
+    126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
+    144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,
+    162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,
+    180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,
+    198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,
+    216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,
+    234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,
+    252,253,254,255
+};
+
+/*
+** The following 256 byte lookup table is used to support SQLites built-in
+** equivalents to the following standard library functions:
+**
+**   isspace()                        0x01
+**   isalpha()                        0x02
+**   isdigit()                        0x04
+**   isalnum()                        0x06
+**   isxdigit()                       0x08
+**   toupper()                        0x20
+**   SQLite identifier character      0x40
+**   Quote character                  0x80
+**
+** Bit 0x20 is set if the mapped character requires translation to upper
+** case. i.e. if the character is a lower-case ASCII character.
+** If x is a lower-case ASCII character, then its upper-case equivalent
+** is (x - 0x20). Therefore toupper() can be implemented as:
+**
+**   (x & ~(map[x]&0x20))
+**
+** The equivalent of tolower() is implemented using the sqlite3UpperToLower[]
+** array. tolower() is used more often than toupper() by SQLite.
+**
+** Bit 0x40 is set if the character is non-alphanumeric and can be used in an 
+** SQLite identifier.  Identifiers are alphanumerics, "_", "$", and any
+** non-ASCII UTF character. Hence the test for whether or not a character is
+** part of an identifier is 0x46.
+*/
+const unsigned char sqlite3CtypeMap[256] = {
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 00..07    ........ */
+  0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,  /* 08..0f    ........ */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 10..17    ........ */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 18..1f    ........ */
+  0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80,  /* 20..27     !"#$%&' */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 28..2f    ()*+,-./ */
+  0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,  /* 30..37    01234567 */
+  0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 38..3f    89:;<=>? */
+
+  0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02,  /* 40..47    @ABCDEFG */
+  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  /* 48..4f    HIJKLMNO */
+  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  /* 50..57    PQRSTUVW */
+  0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40,  /* 58..5f    XYZ[\]^_ */
+  0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22,  /* 60..67    `abcdefg */
+  0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  /* 68..6f    hijklmno */
+  0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  /* 70..77    pqrstuvw */
+  0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 78..7f    xyz{|}~. */
+
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 80..87    ........ */
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 88..8f    ........ */
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 90..97    ........ */
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 98..9f    ........ */
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* a0..a7    ........ */
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* a8..af    ........ */
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* b0..b7    ........ */
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* b8..bf    ........ */
+
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* c0..c7    ........ */
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* c8..cf    ........ */
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* d0..d7    ........ */
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* d8..df    ........ */
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* e0..e7    ........ */
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* e8..ef    ........ */
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* f0..f7    ........ */
+  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40   /* f8..ff    ........ */
+};
+#define sqlite3Toupper(x)   ((x)&~(sqlite3CtypeMap[(unsigned char)(x)]&0x20))
+#define sqlite3Isspace(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x01)
+#define sqlite3Isalnum(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x06)
+#define sqlite3Isalpha(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x02)
+#define sqlite3Isdigit(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x04)
+#define sqlite3Isxdigit(x)  (sqlite3CtypeMap[(unsigned char)(x)]&0x08)
+#define sqlite3Tolower(x)   (sqlite3UpperToLower[(unsigned char)(x)])
+#define sqlite3Isquote(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x80)
+
+
+/*
+** If X is a character that can be used in an identifier then
+** IdChar(X) will be true.  Otherwise it is false.
+**
+** For ASCII, any character with the high-order bit set is
+** allowed in an identifier.  For 7-bit characters, 
+** sqlite3IsIdChar[X] must be 1.
+**
+** For EBCDIC, the rules are more complex but have the same
+** end result.
+**
+** Ticket #1066.  the SQL standard does not allow '$' in the
+** middle of identifiers.  But many SQL implementations do. 
+** SQLite will allow '$' in identifiers for compatibility.
+** But the feature is undocumented.
+*/
+#define IdChar(C)  ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)
+
+/*
+** Ignore testcase() macros
+*/
+#define testcase(X)
+
+/*
+** Token values
+*/
+#define TK_SPACE    0
+#define TK_NAME     1
+#define TK_LITERAL  2
+#define TK_PUNCT    3
+#define TK_ERROR    4
+
+#define TK_MINUS    TK_PUNCT
+#define TK_LP       TK_PUNCT
+#define TK_RP       TK_PUNCT
+#define TK_SEMI     TK_PUNCT
+#define TK_PLUS     TK_PUNCT
+#define TK_STAR     TK_PUNCT
+#define TK_SLASH    TK_PUNCT
+#define TK_REM      TK_PUNCT
+#define TK_EQ       TK_PUNCT
+#define TK_LE       TK_PUNCT
+#define TK_NE       TK_PUNCT
+#define TK_LSHIFT   TK_PUNCT
+#define TK_LT       TK_PUNCT
+#define TK_GE       TK_PUNCT
+#define TK_RSHIFT   TK_PUNCT
+#define TK_GT       TK_PUNCT
+#define TK_GE       TK_PUNCT
+#define TK_BITOR    TK_PUNCT
+#define TK_CONCAT   TK_PUNCT
+#define TK_COMMA    TK_PUNCT
+#define TK_BITAND   TK_PUNCT
+#define TK_BITNOT   TK_PUNCT
+#define TK_STRING   TK_LITERAL
+#define TK_ID       TK_NAME
+#define TK_ILLEGAL  TK_ERROR
+#define TK_DOT      TK_PUNCT
+#define TK_INTEGER  TK_LITERAL
+#define TK_FLOAT    TK_LITERAL
+#define TK_VARIABLE TK_LITERAL
+#define TK_BLOB     TK_LITERAL
+
+/*
+** Return the length (in bytes) of the token that begins at z[0]. 
+** Store the token type in *tokenType before returning.
+*/
+static int sqlite3GetToken(const unsigned char *z, int *tokenType){
+  int i, c;
+  switch( aiClass[*z] ){  /* Switch on the character-class of the first byte
+                          ** of the token. See the comment on the CC_ defines
+                          ** above. */
+    case CC_SPACE: {
+      for(i=1; sqlite3Isspace(z[i]); i++){}
+      *tokenType = TK_SPACE;
+      return i;
+    }
+    case CC_MINUS: {
+      if( z[1]=='-' ){
+        for(i=2; (c=z[i])!=0 && c!='\n'; i++){}
+        *tokenType = TK_SPACE;
+        return i;
+      }
+      *tokenType = TK_MINUS;
+      return 1;
+    }
+    case CC_LP: {
+      *tokenType = TK_LP;
+      return 1;
+    }
+    case CC_RP: {
+      *tokenType = TK_RP;
+      return 1;
+    }
+    case CC_SEMI: {
+      *tokenType = TK_SEMI;
+      return 1;
+    }
+    case CC_PLUS: {
+      *tokenType = TK_PLUS;
+      return 1;
+    }
+    case CC_STAR: {
+      *tokenType = TK_STAR;
+      return 1;
+    }
+    case CC_SLASH: {
+      if( z[1]!='*' || z[2]==0 ){
+        *tokenType = TK_SLASH;
+        return 1;
+      }
+      for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){}
+      if( c ) i++;
+      *tokenType = TK_SPACE;
+      return i;
+    }
+    case CC_PERCENT: {
+      *tokenType = TK_REM;
+      return 1;
+    }
+    case CC_EQ: {
+      *tokenType = TK_EQ;
+      return 1 + (z[1]=='=');
+    }
+    case CC_LT: {
+      if( (c=z[1])=='=' ){
+        *tokenType = TK_LE;
+        return 2;
+      }else if( c=='>' ){
+        *tokenType = TK_NE;
+        return 2;
+      }else if( c=='<' ){
+        *tokenType = TK_LSHIFT;
+        return 2;
+      }else{
+        *tokenType = TK_LT;
+        return 1;
+      }
+    }
+    case CC_GT: {
+      if( (c=z[1])=='=' ){
+        *tokenType = TK_GE;
+        return 2;
+      }else if( c=='>' ){
+        *tokenType = TK_RSHIFT;
+        return 2;
+      }else{
+        *tokenType = TK_GT;
+        return 1;
+      }
+    }
+    case CC_BANG: {
+      if( z[1]!='=' ){
+        *tokenType = TK_ILLEGAL;
+        return 1;
+      }else{
+        *tokenType = TK_NE;
+        return 2;
+      }
+    }
+    case CC_PIPE: {
+      if( z[1]!='|' ){
+        *tokenType = TK_BITOR;
+        return 1;
+      }else{
+        *tokenType = TK_CONCAT;
+        return 2;
+      }
+    }
+    case CC_COMMA: {
+      *tokenType = TK_COMMA;
+      return 1;
+    }
+    case CC_AND: {
+      *tokenType = TK_BITAND;
+      return 1;
+    }
+    case CC_TILDA: {
+      *tokenType = TK_BITNOT;
+      return 1;
+    }
+    case CC_QUOTE: {
+      int delim = z[0];
+      testcase( delim=='`' );
+      testcase( delim=='\'' );
+      testcase( delim=='"' );
+      for(i=1; (c=z[i])!=0; i++){
+        if( c==delim ){
+          if( z[i+1]==delim ){
+            i++;
+          }else{
+            break;
+          }
+        }
+      }
+      if( c=='\'' ){
+        *tokenType = TK_STRING;
+        return i+1;
+      }else if( c!=0 ){
+        *tokenType = TK_ID;
+        return i+1;
+      }else{
+        *tokenType = TK_ILLEGAL;
+        return i;
+      }
+    }
+    case CC_DOT: {
+      if( !sqlite3Isdigit(z[1]) ){
+        *tokenType = TK_DOT;
+        return 1;
+      }
+      /* If the next character is a digit, this is a floating point
+      ** number that begins with ".".  Fall thru into the next case */
+    }
+    case CC_DIGIT: {
+      *tokenType = TK_INTEGER;
+      if( z[0]=='0' && (z[1]=='x' || z[1]=='X') && sqlite3Isxdigit(z[2]) ){
+        for(i=3; sqlite3Isxdigit(z[i]); i++){}
+        return i;
+      }
+      for(i=0; sqlite3Isdigit(z[i]); i++){}
+      if( z[i]=='.' ){
+        i++;
+        while( sqlite3Isdigit(z[i]) ){ i++; }
+        *tokenType = TK_FLOAT;
+      }
+      if( (z[i]=='e' || z[i]=='E') &&
+           ( sqlite3Isdigit(z[i+1]) 
+            || ((z[i+1]=='+' || z[i+1]=='-') && sqlite3Isdigit(z[i+2]))
+           )
+      ){
+        i += 2;
+        while( sqlite3Isdigit(z[i]) ){ i++; }
+        *tokenType = TK_FLOAT;
+      }
+      while( IdChar(z[i]) ){
+        *tokenType = TK_ILLEGAL;
+        i++;
+      }
+      return i;
+    }
+    case CC_QUOTE2: {
+      for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
+      *tokenType = c==']' ? TK_ID : TK_ILLEGAL;
+      return i;
+    }
+    case CC_VARNUM: {
+      *tokenType = TK_VARIABLE;
+      for(i=1; sqlite3Isdigit(z[i]); i++){}
+      return i;
+    }
+    case CC_DOLLAR:
+    case CC_VARALPHA: {
+      int n = 0;
+      testcase( z[0]=='$' );  testcase( z[0]=='@' );
+      testcase( z[0]==':' );  testcase( z[0]=='#' );
+      *tokenType = TK_VARIABLE;
+      for(i=1; (c=z[i])!=0; i++){
+        if( IdChar(c) ){
+          n++;
+        }else if( c=='(' && n>0 ){
+          do{
+            i++;
+          }while( (c=z[i])!=0 && !sqlite3Isspace(c) && c!=')' );
+          if( c==')' ){
+            i++;
+          }else{
+            *tokenType = TK_ILLEGAL;
+          }
+          break;
+        }else if( c==':' && z[i+1]==':' ){
+          i++;
+        }else{
+          break;
+        }
+      }
+      if( n==0 ) *tokenType = TK_ILLEGAL;
+      return i;
+    }
+    case CC_KYWD: {
+      for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
+      if( IdChar(z[i]) ){
+        /* This token started out using characters that can appear in keywords,
+        ** but z[i] is a character not allowed within keywords, so this must
+        ** be an identifier instead */
+        i++;
+        break;
+      }
+      *tokenType = TK_ID;
+      return i;
+    }
+    case CC_X: {
+      testcase( z[0]=='x' ); testcase( z[0]=='X' );
+      if( z[1]=='\'' ){
+        *tokenType = TK_BLOB;
+        for(i=2; sqlite3Isxdigit(z[i]); i++){}
+        if( z[i]!='\'' || i%2 ){
+          *tokenType = TK_ILLEGAL;
+          while( z[i] && z[i]!='\'' ){ i++; }
+        }
+        if( z[i] ) i++;
+        return i;
+      }
+      /* If it is not a BLOB literal, then it must be an ID, since no
+      ** SQL keywords start with the letter 'x'.  Fall through */
+    }
+    case CC_ID: {
+      i = 1;
+      break;
+    }
+    default: {
+      *tokenType = TK_ILLEGAL;
+      return 1;
+    }
+  }
+  while( IdChar(z[i]) ){ i++; }
+  *tokenType = TK_ID;
+  return i;
+}
+
+char *sqlite3_normalize(const char *zSql){
+  char *z;              /* The output string */
+  sqlite3_int64 nZ;     /* Size of the output string in bytes */
+  sqlite3_int64 nSql;   /* Size of the input string in bytes */
+  int i;                /* Next character to read from zSql[] */
+  int j;                /* Next slot to fill in on z[] */
+  int tokenType;        /* Type of the next token */
+  int n;                /* Size of the next token */
+  int k;                /* Loop counter */
+
+  nSql = strlen(zSql);
+  nZ = nSql;
+  z = sqlite3_malloc64( nZ+2 );
+  if( z==0 ) return 0;
+  for(i=j=0; zSql[i]; i += n){
+    n = sqlite3GetToken((unsigned char*)zSql+i, &tokenType);
+    switch( tokenType ){
+      case TK_SPACE: {
+        break;
+      }
+      case TK_ERROR: {
+        sqlite3_free(z);
+        return 0;
+      }
+      case TK_LITERAL: {
+        z[j++] = '?';
+        break;
+      }
+      case TK_PUNCT:
+      case TK_NAME: {
+        if( j>0 && IdChar(z[j-1]) && IdChar(zSql[i]) ) z[j++] = ' ';
+        for(k=0; k<n; k++){
+          z[j++] = sqlite3Tolower(zSql[i+k]);
+        }
+        break;
+      }
+    }
+  }
+  while( j>0 && z[j-1]==' ' ){ j--; }
+  if( i>0 && z[j-1]!=';' ){ z[j++] = ';'; }
+  z[j] = 0;
+  return z;
+}
+
+#ifdef NORMALIZE_TEST
+#include <stdio.h>
+#include <stdlib.h>
+
+static void normalizeFile(char *zIn){
+  int i;
+  if( zIn==0 ) return;
+  for(i=0; zIn[i]; i++){
+    char cSaved;
+    if( zIn[i]!=';' ) continue;
+    cSaved = zIn[i+1];
+    zIn[i+1] = 0;
+    if( sqlite3_complete(zIn) ){
+      char *zOut = sqlite3_normalize(zIn);
+      if( zOut ){
+        printf("%s\n", zOut);
+        sqlite3_free(zOut);
+      }else{
+        fprintf(stderr, "ERROR: %s\n", zIn);
+      }
+      zIn[i+1] = cSaved;
+      zIn += i+1;
+      i = -1;
+    }else{
+      zIn[i+1] = cSaved;
+    }
+  }
+}
+
+int main(int argc, char **argv){
+  int i;
+  FILE *in;
+  char *zBuf = 0;
+  sqlite3_int64 sz, got;
+
+  for(i=1; i<argc; i++){
+    in = fopen(argv[i], "rb");
+    if( in==0 ){
+      fprintf(stderr, "cannot open \"%s\"\n", argv[i]);
+      continue;
+    }
+    fseek(in, 0, SEEK_END);
+    sz = ftell(in);
+    rewind(in);
+    zBuf = sqlite3_realloc64(zBuf, sz+1);
+    if( zBuf==0 ){
+      fprintf(stderr, "failed to malloc for %lld bytes\n", sz);
+      exit(1);
+    }
+    got = fread(zBuf, 1, sz, in);
+    fclose(in);
+    if( got!=sz ){
+      fprintf(stderr, "only able to read %lld of %lld bytes from \"%s\"\n",
+              got, sz, argv[i]);
+    }else{
+      zBuf[got] = 0;
+      normalizeFile(zBuf);
+    }
+  }
+  sqlite3_free(zBuf);
+}
+#endif /* NORMALIZE_TEST */
--- a/14
+++ b/14
@ -1,5 +1,5 @@
-C Avoid\sthe\suse\sof\sutimensat()\son\solder\sunix\splatforms.
-D 2018-01-07T23:28:10.983
+C First\scode\sfor\san\sauxiliary\sfunction\sthat\swill\snormalize\san\sSQL\sstatement.
+D 2018-01-08T16:54:26.258
 F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1
 F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea
 F Makefile.in 12b6daa4bdb03fa87da27cbc205ff88ace645475b5be79414a3038b68ade14cb
@ -284,6 +284,7 @@ F ext/misc/json1.c dbe086615b9546c156bf32b9378fc09383b58bd17513b866cfd24c1e15281
 F ext/misc/memvfs.c ab36f49e02ebcdf85a1e08dc4d8599ea8f343e073ac9e0bca18a98b7e1ec9567
 F ext/misc/mmapwarm.c 70b618f2d0bde43fae288ad0b7498a629f2b6f61b50a27e06fae3cd23c83af29
 F ext/misc/nextchar.c 35c8b8baacb96d92abbb34a83a997b797075b342
+F ext/misc/normalize.c 5ca65b1332e41d679d7c07374398b49d020281d4ea30f267c6a6fb500e71f7c7
 F ext/misc/percentile.c 92699c8cd7d517ff610e6037e56506f8904dae2e
 F ext/misc/regexp.c a68d25c659bd2d893cd1215667bbf75ecb9dc7d4
 F ext/misc/remember.c add730f0f7e7436cd15ea3fd6a90fd83c3f706ab44169f7f048438b7d6baa69c
@ -1697,7 +1698,10 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93
 F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc
 F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e
 F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0
-P 0a50c9e3bb0dbdaaec819ac6453276ba287b475ea322918ddda1ab3a1ec4b58b
-R 1b4834f0a49c166cef0f6faa748094c7
+P 90cb01d8d6ac12d0b88f2952a75aeefa81ba66f5e4a5377fdd8b9f86aec8e927
+R 9bd921419b4f2c3b5e55f041d45b29ce
+T *branch * normalize
+T *sym-normalize *
+T -sym-trunk *
 U drh
-Z 0323f752d28821f22e44c20f1850c6dd
+Z c8d4afe60d21d90fa2f1cb97ff587c09
--- a/manifest.uuid
+++ b/manifest.uuid
@ -1 +1 @@
-90cb01d8d6ac12d0b88f2952a75aeefa81ba66f5e4a5377fdd8b9f86aec8e927
+84814aac81e54f03430f180926156ab0fc01e22bbce2bb228b698ea0d4a588ea