Rework the UTF8 reader logic in order to avoid the use of malloc().

Ticket #2523. (CVS 4175) FossilOrigin-Name: 9a059cb6bced5cdc950f7816602ac92d89a899be
2007-07-23 19:12:41 +00:00 · 2007-07-23 19:12:41 +00:00 · 6615095629
parent ad6b3159be
commit 6615095629
5 changed files with 134 additions and 195 deletions
--- a/16
+++ b/16
@ -1,5 +1,5 @@
-C Fix\sa\sbad\ssizeof\sin\svdbe.c.\s\sTicket\s#2522.\s(CVS\s4174)
-D 2007-07-22T19:10:21
+C Rework\sthe\sUTF8\sreader\slogic\sin\sorder\sto\savoid\sthe\suse\sof\smalloc().\nTicket\s#2523.\s(CVS\s4175)
+D 2007-07-23T19:12:42
 F Makefile.in 0c0e53720f658c7a551046442dd7afba0b72bfbe
 F Makefile.linux-gcc 65241babba6faf1152bf86574477baab19190499
 F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
@ -78,7 +78,7 @@ F src/date.c 6049db7d5a8fdf2c677ff7d58fa31d4f6593c988
 F src/delete.c 5c0d89b3ef7d48fe1f5124bfe8341f982747fe29
 F src/experimental.c 1b2d1a6cd62ecc39610e97670332ca073c50792b
 F src/expr.c de9f55b1baed00199466028ad96967208d487798
-F src/func.c 6b45261aa2c514f642201b90493af68469c04af6
+F src/func.c dcba54fc18d2b2fd02f8b7c3dc13e27d100a4d8e
 F src/hash.c 67b23e14f0257b69a3e8aa663e4eeadc1a2b6fd5
 F src/hash.h 1b3f7e2609141fd571f62199fc38687d262e9564
 F src/insert.c 89d184422d85db0418e0f66032ccea3657078ecd
@ -111,7 +111,7 @@ F src/server.c 087b92a39d883e3fa113cae259d64e4c7438bc96
 F src/shell.c e7534cce78398bc1cac4a643e931fc6221c2897e
 F src/sqlite.h.in 8164526b1658a6dad472953ea91239849f913d45
 F src/sqlite3ext.h a27bedc222df5e5f0f458ac99726d0483b953a91
-F src/sqliteInt.h 81183ae71162818bf60478e738ff68604128bb06
+F src/sqliteInt.h 358f3a29b98e1efdd840a928dec8f60a51e6a33e
 F src/sqliteLimit.h f14609c27636ebc217c9603ade26dbdd7d0f6afa
 F src/table.c a8de75bcedf84d4060d804264b067ab3b1a3561d
 F src/tclsqlite.c 0d3370e01cd3b313ed29ed6b0ba00423b4329de0
@ -137,7 +137,7 @@ F src/test_tclvar.c ea4500a60d663f7fdf18fd3210efc112e0c6e7f0
 F src/tokenize.c 0f0955ef7b8ab99ba2d3099faa89b80ccba3733a
 F src/trigger.c 420192efe3e6f03addf7897c60c3c8bf913d3493
 F src/update.c 6b10becb6235ea314ed245fbfbf8b38755e3166e
-F src/utf.c 01b2aba02b10d12903e9e1ff897215c9faf6b662
+F src/utf.c c152f99ddccc5e0214a9817aa07ab1b208b43f14
 F src/util.c 9e81d417fc60bd2fe156f8f2317aa4845bc6cc90
 F src/vacuum.c 8bd895d29e7074e78d4e80f948e35ddc9cf2beef
 F src/vdbe.c a58fe70f11078deb16f6825cc99f099d2fad4a7b
@ -520,7 +520,7 @@ F www/tclsqlite.tcl 8be95ee6dba05eabcd27a9d91331c803f2ce2130
 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
 F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
 F www/whentouse.tcl fc46eae081251c3c181bd79c5faef8195d7991a5
-P 1924ba5207bdc8d503c17cd9460c1a9f9c357635
-R 6a3d5d19ad9da4a9718db45f3a6f4e18
+P 77ebc3feb089c28155cf20873fb4eabd26fa50c1
+R 4c6f94c5ade866798dc608d64060285b
 U drh
-Z f3b0c8bff800cc59d8eb156576c3d0e8
+Z 9a4a3510d0a6e206d28b34d524cb6b1e
--- a/manifest.uuid
+++ b/manifest.uuid
@ -1 +1 @@
-77ebc3feb089c28155cf20873fb4eabd26fa50c1
+9a059cb6bced5cdc950f7816602ac92d89a899be
--- a/src/func.c
+++ b/src/func.c
@ -16,7 +16,7 @@
 ** sqliteRegisterBuildinFunctions() found at the bottom of the file.
 ** All other code has file scope.
 **
-** $Id: func.c,v 1.161 2007/06/22 15:21:16 danielk1977 Exp $
+** $Id: func.c,v 1.162 2007/07/23 19:12:42 drh Exp $
 */
 #include "sqliteInt.h"
 #include <ctype.h>
@ -26,6 +26,7 @@
 #include "vdbeInt.h"
 #include "os.h"

+
 /*
 ** Return the collating function associated with a function.
 */
@ -397,15 +398,6 @@ static const struct compareInfo likeInfoNorm = { '%', '_',   0, 1 };
 ** is case sensitive causing 'a' LIKE 'A' to be false */
 static const struct compareInfo likeInfoAlt = { '%', '_',   0, 0 };

-/*
-** Read a single UTF-8 character and return its value.
-*/
-u32 sqlite3ReadUtf8(const unsigned char *z){
-  u32 c;
-  SQLITE_READ_UTF8(z, c);
-  return c;
-}
-
 /*
 ** Compare two UTF-8 strings for equality where the first string can
 ** potentially be a "glob" expression.  Return true (1) if they
@ -440,97 +432,102 @@ static int patternCompare(
  const struct compareInfo *pInfo, /* Information about how to do the compare */
  const int esc                    /* The escape character */
 ){
-  register int c;
+  int c, c2;
  int invert;
  int seen;
-  int c2;
  u8 matchOne = pInfo->matchOne;
  u8 matchAll = pInfo->matchAll;
  u8 matchSet = pInfo->matchSet;
  u8 noCase = pInfo->noCase; 
  int prevEscape = 0;     /* True if the previous character was 'escape' */

-  while( (c = *zPattern)!=0 ){
+  while( (c = sqlite3Utf8Read(zPattern,0,&zPattern))!=0 ){
    if( !prevEscape && c==matchAll ){
-      while( (c=zPattern[1]) == matchAll || c == matchOne ){
-        if( c==matchOne ){
-          if( *zString==0 ) return 0;
-          SQLITE_SKIP_UTF8(zString);
+      while( (c=sqlite3Utf8Read(zPattern,0,&zPattern)) == matchAll
+               || c == matchOne ){
+        if( c==matchOne && sqlite3Utf8Read(zString, 0, &zString)==0 ){
+          return 0;
        }
-        zPattern++;
      }
-      if( c && esc && sqlite3ReadUtf8(&zPattern[1])==esc ){
-        u8 const *zTemp = &zPattern[1];
-        SQLITE_SKIP_UTF8(zTemp);
-        c = *zTemp;
-      }
-      if( c==0 ) return 1;
-      if( c==matchSet ){
-        assert( esc==0 );   /* This is GLOB, not LIKE */
-        while( *zString && patternCompare(&zPattern[1],zString,pInfo,esc)==0 ){
+      if( c==0 ){
+        return 1;
+      }else if( c==esc ){
+        c = sqlite3Utf8Read(zPattern, 0, &zPattern);
+        if( c==0 ){
+          return 0;
+        }
+      }else if( c==matchSet ){
+        assert( esc==0 );         /* This is GLOB, not LIKE */
+        assert( matchSet<0x80 );  /* '[' is a single-byte character */
+        while( *zString && patternCompare(&zPattern[-1],zString,pInfo,esc)==0 ){
          SQLITE_SKIP_UTF8(zString);
        }
        return *zString!=0;
-      }else{
-        while( (c2 = *zString)!=0 ){
-          if( noCase ){
-            c2 = sqlite3UpperToLower[c2];
-            c = sqlite3UpperToLower[c];
-            while( c2 != 0 && c2 != c ){ c2 = sqlite3UpperToLower[*++zString]; }
-          }else{
-            while( c2 != 0 && c2 != c ){ c2 = *++zString; }
+      }
+      while( (c2 = sqlite3Utf8Read(zString,0,&zString))!=0 ){
+        if( noCase ){
+          c2 = c2<0x80 ? sqlite3UpperToLower[c2] : c2;
+          c = c<0x80 ? sqlite3UpperToLower[c] : c;
+          while( c2 != 0 && c2 != c ){
+            c2 = sqlite3Utf8Read(zString, 0, &zString);
+            if( c2<0x80 ) c2 = sqlite3UpperToLower[c2];
+          }
+        }else{
+          while( c2 != 0 && c2 != c ){
+            c2 = sqlite3Utf8Read(zString, 0, &zString);
          }
-          if( c2==0 ) return 0;
-          if( patternCompare(&zPattern[1],zString,pInfo,esc) ) return 1;
-          SQLITE_SKIP_UTF8(zString);
        }
+        if( c2==0 ) return 0;
+        if( patternCompare(zPattern,zString,pInfo,esc) ) return 1;
+      }
+      return 0;
+    }else if( !prevEscape && c==matchOne ){
+      if( sqlite3Utf8Read(zString, 0, &zString)==0 ){
        return 0;
      }
-    }else if( !prevEscape && c==matchOne ){
-      if( *zString==0 ) return 0;
-      SQLITE_SKIP_UTF8(zString);
-      zPattern++;
    }else if( c==matchSet ){
      int prior_c = 0;
      assert( esc==0 );    /* This only occurs for GLOB, not LIKE */
      seen = 0;
      invert = 0;
-      c = sqlite3ReadUtf8(zString);
+      c = sqlite3Utf8Read(zString, 0, &zString);
      if( c==0 ) return 0;
-      c2 = *++zPattern;
-      if( c2=='^' ){ invert = 1; c2 = *++zPattern; }
+      c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
+      if( c2=='^' ){
+        invert = 1;
+        c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
+      }
      if( c2==']' ){
        if( c==']' ) seen = 1;
-        c2 = *++zPattern;
+        c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
      }
-      while( (c2 = sqlite3ReadUtf8(zPattern))!=0 && c2!=']' ){
-        if( c2=='-' && zPattern[1]!=']' && zPattern[1]!=0 && prior_c>0 ){
-          zPattern++;
-          c2 = sqlite3ReadUtf8(zPattern);
+      while( c2 && c2!=']' ){
+        if( c2=='-' && zPattern[0]!=']' && zPattern[0]!=0 && prior_c>0 ){
+          c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
          if( c>=prior_c && c<=c2 ) seen = 1;
          prior_c = 0;
-        }else if( c==c2 ){
-          seen = 1;
-          prior_c = c2;
        }else{
+          if( c==c2 ){
+            seen = 1;
+          }
          prior_c = c2;
        }
-        SQLITE_SKIP_UTF8(zPattern);
+        c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
      }
-      if( c2==0 || (seen ^ invert)==0 ) return 0;
-      SQLITE_SKIP_UTF8(zString);
-      zPattern++;
-    }else if( esc && !prevEscape && sqlite3ReadUtf8(zPattern)==esc){
+      if( c2==0 || (seen ^ invert)==0 ){
+        return 0;
+      }
+    }else if( esc==c && !prevEscape ){
      prevEscape = 1;
-      SQLITE_SKIP_UTF8(zPattern);
    }else{
+      c2 = sqlite3Utf8Read(zString, 0, &zString);
      if( noCase ){
-        if( sqlite3UpperToLower[c] != sqlite3UpperToLower[*zString] ) return 0;
-      }else{
-        if( c != *zString ) return 0;
+        c = c<0x80 ? sqlite3UpperToLower[c] : c;
+        c2 = c2<0x80 ? sqlite3UpperToLower[c2] : c2;
+      }
+      if( c!=c2 ){
+        return 0;
      }
-      zPattern++;
-      zString++;
      prevEscape = 0;
    }
  }
@ -590,7 +587,7 @@ static void likeFunc(
          "ESCAPE expression must be a single character", -1);
      return;
    }
-    escape = sqlite3ReadUtf8(zEsc);
+    escape = sqlite3Utf8Read(zEsc, 0, &zEsc);
  }
  if( zA && zB ){
    struct compareInfo *pInfo = sqlite3_user_data(context);
--- a/src/sqliteInt.h
+++ b/src/sqliteInt.h
@ -11,7 +11,7 @@
 *************************************************************************
 ** Internal interface definitions for SQLite.
 **
-** @(#) $Id: sqliteInt.h,v 1.578 2007/06/26 10:38:55 danielk1977 Exp $
+** @(#) $Id: sqliteInt.h,v 1.579 2007/07/23 19:12:42 drh Exp $
 */
 #ifndef _SQLITEINT_H_
 #define _SQLITEINT_H_
@ -1556,62 +1556,15 @@ typedef struct {
 extern int sqlite3_always_code_trigger_setup;

 /*
-** A lookup table used by the SQLITE_READ_UTF8 macro.  The definition
-** is in utf.c.
+** Assuming zIn points to the first byte of a UTF-8 character,
+** advance zIn to point to the first byte of the next UTF-8 character.
 */
-extern const unsigned char sqlite3UtfTrans1[];
-
-/*
-** Macros for reading UTF8 characters.
-**
-** SQLITE_READ_UTF8(x,c) reads a single UTF8 value out of x and writes
-** that value into c.  The type of x must be unsigned char*.  The type
-** of c must be unsigned int.
-**
-** SQLITE_SKIP_UTF8(x) advances x forward by one character.  The type of
-** x must be unsigned char*.
-**
-** Notes On Invalid UTF-8:
-**
-**  *  These macros never allow a 7-bit character (0x00 through 0x7f) to
-**     be encoded as a multi-byte character.  Any multi-byte character that
-**     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
-**
-**  *  These macros never allow a UTF16 surrogate value to be encoded.
-**     If a multi-byte character attempts to encode a value between
-**     0xd800 and 0xe000 then it is rendered as 0xfffd.
-**
-**  *  Bytes in the range of 0x80 through 0xbf which occur as the first
-**     byte of a character are interpreted as single-byte characters
-**     and rendered as themselves even though they are technically
-**     invalid characters.
-**
-**  *  These routines accept an infinite number of different UTF8 encodings
-**     for unicode values 0x80 and greater.  They do not change over-length
-**     encodings to 0xfffd as some systems recommend.
-** 
-*/
-#define SQLITE_READ_UTF8(zIn, c) {                     \
-  c = *(zIn++);                                        \
-  if( c>=0xc0 ){                                       \
-    c = sqlite3UtfTrans1[c-0xc0];                      \
-    while( (*zIn & 0xc0)==0x80 ){                      \
-      c = (c<<6) + (0x3f & *(zIn++));                  \
-    }                                                  \
-    if( c<0x80                                         \
-        || (c&0xFFFFF800)==0xD800                      \
-        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }    \
-  }                                                    \
-}
 #define SQLITE_SKIP_UTF8(zIn) {                        \
  if( (*(zIn++))>=0xc0 ){                              \
    while( (*zIn & 0xc0)==0x80 ){ zIn++; }             \
  }                                                    \
 }

-
-
-
 /*
 ** The SQLITE_CORRUPT_BKPT macro can be either a constant (for production
 ** builds) or a function call (for debugging).  If it is a function call,
@ -1830,7 +1783,7 @@ int sqlite3GetInt32(const char *, int*);
 int sqlite3FitsIn64Bits(const char *);
 int sqlite3Utf16ByteLen(const void *pData, int nChar);
 int sqlite3Utf8CharLen(const char *pData, int nByte);
-u32 sqlite3ReadUtf8(const unsigned char *);
+int sqlite3Utf8Read(const u8*, const u8*, const u8**);
 int sqlite3PutVarint(unsigned char *, u64);
 int sqlite3GetVarint(const unsigned char *, u64 *);
 int sqlite3GetVarint32(const unsigned char *, u32 *);
--- a/src/utf.c
+++ b/src/utf.c
@ -12,7 +12,7 @@
 ** This file contains routines used to translate between UTF-8, 
 ** UTF-16, UTF-16BE, and UTF-16LE.
 **
-** $Id: utf.c,v 1.51 2007/05/23 16:23:09 danielk1977 Exp $
+** $Id: utf.c,v 1.52 2007/07/23 19:12:42 drh Exp $
 **
 ** Notes on UTF-8:
 **
@ -60,6 +60,7 @@ const unsigned char sqlite3UtfTrans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
 };

+
 #define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (c&0xFF);                                \
@ -126,6 +127,54 @@ const unsigned char sqlite3UtfTrans1[] = {
  }                                                                   \
 }

+/*
+** Translate a single UTF-8 character.  Return the unicode value.
+**
+** During translation, assume that the byte that zTerm points to
+** is a 0x00.
+**
+** Write a pointer to the next unread byte back into *pzNext.
+**
+** Notes On Invalid UTF-8:
+**
+**  *  This routine never allows a 7-bit character (0x00 through 0x7f) to
+**     be encoded as a multi-byte character.  Any multi-byte character that
+**     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
+**
+**  *  This routine never allows a UTF16 surrogate value to be encoded.
+**     If a multi-byte character attempts to encode a value between
+**     0xd800 and 0xe000 then it is rendered as 0xfffd.
+**
+**  *  Bytes in the range of 0x80 through 0xbf which occur as the first
+**     byte of a character are interpreted as single-byte characters
+**     and rendered as themselves even though they are technically
+**     invalid characters.
+**
+**  *  This routine accepts an infinite number of different UTF8 encodings
+**     for unicode values 0x80 and greater.  It does not change over-length
+**     encodings to 0xfffd as some systems recommend.
+*/
+int sqlite3Utf8Read(
+  const unsigned char *z,         /* First byte of UTF-8 character */
+  const unsigned char *zTerm,     /* Pretend this byte is 0x00 */
+  const unsigned char **pzNext    /* Write first byte past UTF-8 char here */
+){
+  int c = *(z++);                 /* First byte is read without checking zTerm */
+  if( c>=0xc0 ){                  /* 0xc0 and above: start of a multi-byte form */
+    c = sqlite3UtfTrans1[c-0xc0]; /* Presumably the lead byte's payload bits */
+    while( z!=zTerm && (*z & 0xc0)==0x80 ){  /* Stop at zTerm or non-10xxxxxx */
+      c = (c<<6) + (0x3f & *(z++));          /* Append low 6 bits of the byte */
+    }
+    if( c<0x80                               /* Multi-byte form of 7-bit value */
+        || (c&0xFFFFF800)==0xD800            /* 0xD800 through 0xDFFF */
+        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }  /* 0xFFFE or 0xFFFF */
+  }
+  *pzNext = z;                    /* Always written, even for 1-byte input */
+  return c;                       /* Decoded value, or 0xFFFD if rejected */
+}
+
+
+
 /*
 ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
 ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
@ -219,81 +268,19 @@ int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
  z = zOut;

  if( pMem->enc==SQLITE_UTF8 ){
-    unsigned int iExtra = 0xD800;
-
-    if( 0==(pMem->flags&MEM_Term) && zTerm>zIn && (zTerm[-1]&0x80) ){
-      /* This UTF8 string is not nul-terminated, and the last byte is
-      ** not a character in the ascii range (codpoints 0..127). This
-      ** means the SQLITE_READ_UTF8() macro might read past the end
-      ** of the allocated buffer.
-      **
-      ** There are four possibilities:
-      **
-      **   1. The last byte is the first byte of a non-ASCII character,
-      **
-      **   2. The final N bytes of the input string are continuation bytes
-      **      and immediately preceding them is the first byte of a 
-      **      non-ASCII character.
-      **
-      **   3. The final N bytes of the input string are continuation bytes
-      **      and immediately preceding them is a byte that encodes a 
-      **      character in the ASCII range.
-      **
-      **   4. The entire string consists of continuation characters.
-      **
-      ** Cases (3) and (4) require no special handling. The SQLITE_READ_UTF8()
-      ** macro will not overread the buffer in these cases.
-      */
-      unsigned char *zExtra = &zTerm[-1];
-      while( zExtra>zIn && (zExtra[0]&0xC0)==0x80 ){
-        zExtra--;
-      }
-
-      if( (zExtra[0]&0xC0)==0xC0 ){
-        /* Make a copy of the last character encoding in the input string.
-        ** Then make sure it is nul-terminated and use SQLITE_READ_UTF8()
-        ** to decode the codepoint. Store the codepoint in variable iExtra,
-        ** it will be appended to the output string later.
-        */
-        unsigned char *zFree = 0;
-        unsigned char zBuf[16];
-        int nExtra = (pMem->n+zIn-zExtra);
-        zTerm = zExtra;
-        if( nExtra>15 ){
-          zExtra = sqliteMallocRaw(nExtra+1);
-          if( !zExtra ){
-            return SQLITE_NOMEM;
-          }
-          zFree = zExtra;
-        }else{
-          zExtra = zBuf;
-        }
-        memcpy(zExtra, zTerm, nExtra);
-        zExtra[nExtra] = '\0';
-        SQLITE_READ_UTF8(zExtra, iExtra);
-        sqliteFree(zFree);
-      }
-    }
-
    if( desiredEnc==SQLITE_UTF16LE ){
      /* UTF-8 -> UTF-16 Little-endian */
      while( zIn<zTerm ){
-        SQLITE_READ_UTF8(zIn, c); 
+        c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
        WRITE_UTF16LE(z, c);
      }
-      if( iExtra!=0xD800 ){
-        WRITE_UTF16LE(z, iExtra);
-      }
    }else{
      assert( desiredEnc==SQLITE_UTF16BE );
      /* UTF-8 -> UTF-16 Big-endian */
      while( zIn<zTerm ){
-        SQLITE_READ_UTF8(zIn, c); 
+        c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
        WRITE_UTF16BE(z, c);
      }
-      if( iExtra!=0xD800 ){
-        WRITE_UTF16BE(z, iExtra);
-      }
    }
    pMem->n = z - zOut;
    *z++ = 0;
@ -477,11 +464,11 @@ int sqlite3Utf16ByteLen(const void *zIn, int nChar){
 int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *zOut = zIn;
  unsigned char *zStart = zIn;
-  int c;
+  unsigned char *zTerm;
+  u32 c;

-  while(1){
-    SQLITE_READ_UTF8(zIn, c);
-    if( c==0 ) break;
+  while( zIn[0] ){
+    c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
    if( c!=0xfffd ){
      WRITE_UTF8(zOut, c);
    }
@ -501,6 +488,7 @@ void sqlite3UtfSelfTest(){
  unsigned int i, t;
  unsigned char zBuf[20];
  unsigned char *z;
+  unsigned char *zTerm;
  int n;
  unsigned int c;

@ -509,8 +497,9 @@ void sqlite3UtfSelfTest(){
    WRITE_UTF8(z, i);
    n = z-zBuf;
    z[0] = 0;
+    zTerm = z;
    z = zBuf;
-    SQLITE_READ_UTF8(z, c);
+    c = sqlite3Utf8Read(z, zTerm, (const u8**)&z);
    t = i;
    if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
    if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;