From 6615095629c2f0553180fa154a4b52b7e17b8803 Mon Sep 17 00:00:00 2001 From: drh Date: Mon, 23 Jul 2007 19:12:41 +0000 Subject: [PATCH] Rework the UTF8 reader logic in order to avoid the use of malloc(). Ticket #2523. (CVS 4175) FossilOrigin-Name: 9a059cb6bced5cdc950f7816602ac92d89a899be --- manifest | 16 +++--- manifest.uuid | 2 +- src/func.c | 127 +++++++++++++++++++++++------------------------ src/sqliteInt.h | 55 ++------------------- src/utf.c | 129 ++++++++++++++++++++++-------------------------- 5 files changed, 134 insertions(+), 195 deletions(-) diff --git a/manifest b/manifest index e4f8f88fba..23c330bff3 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Fix\sa\sbad\ssizeof\sin\svdbe.c.\s\sTicket\s#2522.\s(CVS\s4174) -D 2007-07-22T19:10:21 +C Rework\sthe\sUTF8\sreader\slogic\sin\sorder\sto\savoid\sthe\suse\sof\smalloc().\nTicket\s#2523.\s(CVS\s4175) +D 2007-07-23T19:12:42 F Makefile.in 0c0e53720f658c7a551046442dd7afba0b72bfbe F Makefile.linux-gcc 65241babba6faf1152bf86574477baab19190499 F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028 @@ -78,7 +78,7 @@ F src/date.c 6049db7d5a8fdf2c677ff7d58fa31d4f6593c988 F src/delete.c 5c0d89b3ef7d48fe1f5124bfe8341f982747fe29 F src/experimental.c 1b2d1a6cd62ecc39610e97670332ca073c50792b F src/expr.c de9f55b1baed00199466028ad96967208d487798 -F src/func.c 6b45261aa2c514f642201b90493af68469c04af6 +F src/func.c dcba54fc18d2b2fd02f8b7c3dc13e27d100a4d8e F src/hash.c 67b23e14f0257b69a3e8aa663e4eeadc1a2b6fd5 F src/hash.h 1b3f7e2609141fd571f62199fc38687d262e9564 F src/insert.c 89d184422d85db0418e0f66032ccea3657078ecd @@ -111,7 +111,7 @@ F src/server.c 087b92a39d883e3fa113cae259d64e4c7438bc96 F src/shell.c e7534cce78398bc1cac4a643e931fc6221c2897e F src/sqlite.h.in 8164526b1658a6dad472953ea91239849f913d45 F src/sqlite3ext.h a27bedc222df5e5f0f458ac99726d0483b953a91 -F src/sqliteInt.h 81183ae71162818bf60478e738ff68604128bb06 +F src/sqliteInt.h 358f3a29b98e1efdd840a928dec8f60a51e6a33e F src/sqliteLimit.h 
f14609c27636ebc217c9603ade26dbdd7d0f6afa F src/table.c a8de75bcedf84d4060d804264b067ab3b1a3561d F src/tclsqlite.c 0d3370e01cd3b313ed29ed6b0ba00423b4329de0 @@ -137,7 +137,7 @@ F src/test_tclvar.c ea4500a60d663f7fdf18fd3210efc112e0c6e7f0 F src/tokenize.c 0f0955ef7b8ab99ba2d3099faa89b80ccba3733a F src/trigger.c 420192efe3e6f03addf7897c60c3c8bf913d3493 F src/update.c 6b10becb6235ea314ed245fbfbf8b38755e3166e -F src/utf.c 01b2aba02b10d12903e9e1ff897215c9faf6b662 +F src/utf.c c152f99ddccc5e0214a9817aa07ab1b208b43f14 F src/util.c 9e81d417fc60bd2fe156f8f2317aa4845bc6cc90 F src/vacuum.c 8bd895d29e7074e78d4e80f948e35ddc9cf2beef F src/vdbe.c a58fe70f11078deb16f6825cc99f099d2fad4a7b @@ -520,7 +520,7 @@ F www/tclsqlite.tcl 8be95ee6dba05eabcd27a9d91331c803f2ce2130 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0 F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b F www/whentouse.tcl fc46eae081251c3c181bd79c5faef8195d7991a5 -P 1924ba5207bdc8d503c17cd9460c1a9f9c357635 -R 6a3d5d19ad9da4a9718db45f3a6f4e18 +P 77ebc3feb089c28155cf20873fb4eabd26fa50c1 +R 4c6f94c5ade866798dc608d64060285b U drh -Z f3b0c8bff800cc59d8eb156576c3d0e8 +Z 9a4a3510d0a6e206d28b34d524cb6b1e diff --git a/manifest.uuid b/manifest.uuid index a8c8af68c2..acc9f8f520 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -77ebc3feb089c28155cf20873fb4eabd26fa50c1 \ No newline at end of file +9a059cb6bced5cdc950f7816602ac92d89a899be \ No newline at end of file diff --git a/src/func.c b/src/func.c index d76b38d7bb..694dd8d359 100644 --- a/src/func.c +++ b/src/func.c @@ -16,7 +16,7 @@ ** sqliteRegisterBuildinFunctions() found at the bottom of the file. ** All other code has file scope. ** -** $Id: func.c,v 1.161 2007/06/22 15:21:16 danielk1977 Exp $ +** $Id: func.c,v 1.162 2007/07/23 19:12:42 drh Exp $ */ #include "sqliteInt.h" #include @@ -26,6 +26,7 @@ #include "vdbeInt.h" #include "os.h" + /* ** Return the collating function associated with a function. 
*/ @@ -397,15 +398,6 @@ static const struct compareInfo likeInfoNorm = { '%', '_', 0, 1 }; ** is case sensitive causing 'a' LIKE 'A' to be false */ static const struct compareInfo likeInfoAlt = { '%', '_', 0, 0 }; -/* -** Read a single UTF-8 character and return its value. -*/ -u32 sqlite3ReadUtf8(const unsigned char *z){ - u32 c; - SQLITE_READ_UTF8(z, c); - return c; -} - /* ** Compare two UTF-8 strings for equality where the first string can ** potentially be a "glob" expression. Return true (1) if they @@ -440,97 +432,102 @@ static int patternCompare( const struct compareInfo *pInfo, /* Information about how to do the compare */ const int esc /* The escape character */ ){ - register int c; + int c, c2; int invert; int seen; - int c2; u8 matchOne = pInfo->matchOne; u8 matchAll = pInfo->matchAll; u8 matchSet = pInfo->matchSet; u8 noCase = pInfo->noCase; int prevEscape = 0; /* True if the previous character was 'escape' */ - while( (c = *zPattern)!=0 ){ + while( (c = sqlite3Utf8Read(zPattern,0,&zPattern))!=0 ){ if( !prevEscape && c==matchAll ){ - while( (c=zPattern[1]) == matchAll || c == matchOne ){ - if( c==matchOne ){ - if( *zString==0 ) return 0; - SQLITE_SKIP_UTF8(zString); + while( (c=sqlite3Utf8Read(zPattern,0,&zPattern)) == matchAll + || c == matchOne ){ + if( c==matchOne && sqlite3Utf8Read(zString, 0, &zString)==0 ){ + return 0; } - zPattern++; } - if( c && esc && sqlite3ReadUtf8(&zPattern[1])==esc ){ - u8 const *zTemp = &zPattern[1]; - SQLITE_SKIP_UTF8(zTemp); - c = *zTemp; - } - if( c==0 ) return 1; - if( c==matchSet ){ - assert( esc==0 ); /* This is GLOB, not LIKE */ - while( *zString && patternCompare(&zPattern[1],zString,pInfo,esc)==0 ){ + if( c==0 ){ + return 1; + }else if( c==esc ){ + c = sqlite3Utf8Read(zPattern, 0, &zPattern); + if( c==0 ){ + return 0; + } + }else if( c==matchSet ){ + assert( esc==0 ); /* This is GLOB, not LIKE */ + assert( matchSet<0x80 ); /* '[' is a single-byte character */ + while( *zString && 
patternCompare(&zPattern[-1],zString,pInfo,esc)==0 ){ SQLITE_SKIP_UTF8(zString); } return *zString!=0; - }else{ - while( (c2 = *zString)!=0 ){ - if( noCase ){ - c2 = sqlite3UpperToLower[c2]; - c = sqlite3UpperToLower[c]; - while( c2 != 0 && c2 != c ){ c2 = sqlite3UpperToLower[*++zString]; } - }else{ - while( c2 != 0 && c2 != c ){ c2 = *++zString; } + } + while( (c2 = sqlite3Utf8Read(zString,0,&zString))!=0 ){ + if( noCase ){ + c2 = c2<0x80 ? sqlite3UpperToLower[c2] : c2; + c = c<0x80 ? sqlite3UpperToLower[c] : c; + while( c2 != 0 && c2 != c ){ + c2 = sqlite3Utf8Read(zString, 0, &zString); + if( c2<0x80 ) c2 = sqlite3UpperToLower[c2]; + } + }else{ + while( c2 != 0 && c2 != c ){ + c2 = sqlite3Utf8Read(zString, 0, &zString); } - if( c2==0 ) return 0; - if( patternCompare(&zPattern[1],zString,pInfo,esc) ) return 1; - SQLITE_SKIP_UTF8(zString); } + if( c2==0 ) return 0; + if( patternCompare(zPattern,zString,pInfo,esc) ) return 1; + } + return 0; + }else if( !prevEscape && c==matchOne ){ + if( sqlite3Utf8Read(zString, 0, &zString)==0 ){ return 0; } - }else if( !prevEscape && c==matchOne ){ - if( *zString==0 ) return 0; - SQLITE_SKIP_UTF8(zString); - zPattern++; }else if( c==matchSet ){ int prior_c = 0; assert( esc==0 ); /* This only occurs for GLOB, not LIKE */ seen = 0; invert = 0; - c = sqlite3ReadUtf8(zString); + c = sqlite3Utf8Read(zString, 0, &zString); if( c==0 ) return 0; - c2 = *++zPattern; - if( c2=='^' ){ invert = 1; c2 = *++zPattern; } + c2 = sqlite3Utf8Read(zPattern, 0, &zPattern); + if( c2=='^' ){ + invert = 1; + c2 = sqlite3Utf8Read(zPattern, 0, &zPattern); + } if( c2==']' ){ if( c==']' ) seen = 1; - c2 = *++zPattern; + c2 = sqlite3Utf8Read(zPattern, 0, &zPattern); } - while( (c2 = sqlite3ReadUtf8(zPattern))!=0 && c2!=']' ){ - if( c2=='-' && zPattern[1]!=']' && zPattern[1]!=0 && prior_c>0 ){ - zPattern++; - c2 = sqlite3ReadUtf8(zPattern); + while( c2 && c2!=']' ){ + if( c2=='-' && zPattern[0]!=']' && zPattern[0]!=0 && prior_c>0 ){ + c2 = 
sqlite3Utf8Read(zPattern, 0, &zPattern); if( c>=prior_c && c<=c2 ) seen = 1; prior_c = 0; - }else if( c==c2 ){ - seen = 1; - prior_c = c2; }else{ + if( c==c2 ){ + seen = 1; + } prior_c = c2; } - SQLITE_SKIP_UTF8(zPattern); + c2 = sqlite3Utf8Read(zPattern, 0, &zPattern); } - if( c2==0 || (seen ^ invert)==0 ) return 0; - SQLITE_SKIP_UTF8(zString); - zPattern++; - }else if( esc && !prevEscape && sqlite3ReadUtf8(zPattern)==esc){ + if( c2==0 || (seen ^ invert)==0 ){ + return 0; + } + }else if( esc==c && !prevEscape ){ prevEscape = 1; - SQLITE_SKIP_UTF8(zPattern); }else{ + c2 = sqlite3Utf8Read(zString, 0, &zString); if( noCase ){ - if( sqlite3UpperToLower[c] != sqlite3UpperToLower[*zString] ) return 0; - }else{ - if( c != *zString ) return 0; + c = c<0x80 ? sqlite3UpperToLower[c] : c; + c2 = c2<0x80 ? sqlite3UpperToLower[c2] : c2; + } + if( c!=c2 ){ + return 0; } - zPattern++; - zString++; prevEscape = 0; } } @@ -590,7 +587,7 @@ static void likeFunc( "ESCAPE expression must be a single character", -1); return; } - escape = sqlite3ReadUtf8(zEsc); + escape = sqlite3Utf8Read(zEsc, 0, &zEsc); } if( zA && zB ){ struct compareInfo *pInfo = sqlite3_user_data(context); diff --git a/src/sqliteInt.h b/src/sqliteInt.h index 9bc0f32c66..b2a9f3f0e8 100644 --- a/src/sqliteInt.h +++ b/src/sqliteInt.h @@ -11,7 +11,7 @@ ************************************************************************* ** Internal interface definitions for SQLite. ** -** @(#) $Id: sqliteInt.h,v 1.578 2007/06/26 10:38:55 danielk1977 Exp $ +** @(#) $Id: sqliteInt.h,v 1.579 2007/07/23 19:12:42 drh Exp $ */ #ifndef _SQLITEINT_H_ #define _SQLITEINT_H_ @@ -1556,62 +1556,15 @@ typedef struct { extern int sqlite3_always_code_trigger_setup; /* -** A lookup table used by the SQLITE_READ_UTF8 macro. The definition -** is in utf.c. +** Assuming zIn points to the first byte of a UTF-8 character, +** advance zIn to point to the first byte of the next UTF-8 character. 
*/ -extern const unsigned char sqlite3UtfTrans1[]; - -/* -** Macros for reading UTF8 characters. -** -** SQLITE_READ_UTF8(x,c) reads a single UTF8 value out of x and writes -** that value into c. The type of x must be unsigned char*. The type -** of c must be unsigned int. -** -** SQLITE_SKIP_UTF8(x) advances x forward by one character. The type of -** x must be unsigned char*. -** -** Notes On Invalid UTF-8: -** -** * These macros never allow a 7-bit character (0x00 through 0x7f) to -** be encoded as a multi-byte character. Any multi-byte character that -** attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd. -** -** * These macros never allow a UTF16 surrogate value to be encoded. -** If a multi-byte character attempts to encode a value between -** 0xd800 and 0xe000 then it is rendered as 0xfffd. -** -** * Bytes in the range of 0x80 through 0xbf which occur as the first -** byte of a character are interpreted as single-byte characters -** and rendered as themselves even though they are technically -** invalid characters. -** -** * These routines accept an infinite number of different UTF8 encodings -** for unicode values 0x80 and greater. They do not change over-length -** encodings to 0xfffd as some systems recommend. -** -*/ -#define SQLITE_READ_UTF8(zIn, c) { \ - c = *(zIn++); \ - if( c>=0xc0 ){ \ - c = sqlite3UtfTrans1[c-0xc0]; \ - while( (*zIn & 0xc0)==0x80 ){ \ - c = (c<<6) + (0x3f & *(zIn++)); \ - } \ - if( c<0x80 \ - || (c&0xFFFFF800)==0xD800 \ - || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \ - } \ -} #define SQLITE_SKIP_UTF8(zIn) { \ if( (*(zIn++))>=0xc0 ){ \ while( (*zIn & 0xc0)==0x80 ){ zIn++; } \ } \ } - - - /* ** The SQLITE_CORRUPT_BKPT macro can be either a constant (for production ** builds) or a function call (for debugging). 
If it is a function call, @@ -1830,7 +1783,7 @@ int sqlite3GetInt32(const char *, int*); int sqlite3FitsIn64Bits(const char *); int sqlite3Utf16ByteLen(const void *pData, int nChar); int sqlite3Utf8CharLen(const char *pData, int nByte); -u32 sqlite3ReadUtf8(const unsigned char *); +int sqlite3Utf8Read(const u8*, const u8*, const u8**); int sqlite3PutVarint(unsigned char *, u64); int sqlite3GetVarint(const unsigned char *, u64 *); int sqlite3GetVarint32(const unsigned char *, u32 *); diff --git a/src/utf.c b/src/utf.c index 6a8f1c6bc6..fe33e02c39 100644 --- a/src/utf.c +++ b/src/utf.c @@ -12,7 +12,7 @@ ** This file contains routines used to translate between UTF-8, ** UTF-16, UTF-16BE, and UTF-16LE. ** -** $Id: utf.c,v 1.51 2007/05/23 16:23:09 danielk1977 Exp $ +** $Id: utf.c,v 1.52 2007/07/23 19:12:42 drh Exp $ ** ** Notes on UTF-8: ** @@ -60,6 +60,7 @@ const unsigned char sqlite3UtfTrans1[] = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, }; + #define WRITE_UTF8(zOut, c) { \ if( c<0x00080 ){ \ *zOut++ = (c&0xFF); \ @@ -126,6 +127,54 @@ const unsigned char sqlite3UtfTrans1[] = { } \ } +/* +** Translate a single UTF-8 character. Return the unicode value. +** +** During translation, assume that the byte that zTerm points to +** is a 0x00. +** +** Write a pointer to the next unread byte back into *pzNext. +** +** Notes On Invalid UTF-8: +** +** * This routine never allows a 7-bit character (0x00 through 0x7f) to +** be encoded as a multi-byte character. Any multi-byte character that +** attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd. +** +** * This routine never allows a UTF16 surrogate value to be encoded. +** If a multi-byte character attempts to encode a value between +** 0xd800 and 0xe000 then it is rendered as 0xfffd. 
+** +** * Bytes in the range of 0x80 through 0xbf which occur as the first +** byte of a character are interpreted as single-byte characters +** and rendered as themselves even though they are technically +** invalid characters. +** +** * This routine accepts an infinite number of different UTF8 encodings +** for unicode values 0x80 and greater. It does not change over-length +** encodings to 0xfffd as some systems recommend. +*/ +int sqlite3Utf8Read( + const unsigned char *z, /* First byte of UTF-8 character */ + const unsigned char *zTerm, /* Pretend this byte is 0x00 */ + const unsigned char **pzNext /* Write first byte past UTF-8 char here */ +){ + int c = *(z++); + if( c>=0xc0 ){ + c = sqlite3UtfTrans1[c-0xc0]; + while( z!=zTerm && (*z & 0xc0)==0x80 ){ + c = (c<<6) + (0x3f & *(z++)); + } + if( c<0x80 + || (c&0xFFFFF800)==0xD800 + || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } + } + *pzNext = z; + return c; +} + + + /* ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate(). @@ -219,81 +268,19 @@ int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){ z = zOut; if( pMem->enc==SQLITE_UTF8 ){ - unsigned int iExtra = 0xD800; - - if( 0==(pMem->flags&MEM_Term) && zTerm>zIn && (zTerm[-1]&0x80) ){ - /* This UTF8 string is not nul-terminated, and the last byte is - ** not a character in the ascii range (codpoints 0..127). This - ** means the SQLITE_READ_UTF8() macro might read past the end - ** of the allocated buffer. - ** - ** There are four possibilities: - ** - ** 1. The last byte is the first byte of a non-ASCII character, - ** - ** 2. The final N bytes of the input string are continuation bytes - ** and immediately preceding them is the first byte of a - ** non-ASCII character. - ** - ** 3. The final N bytes of the input string are continuation bytes - ** and immediately preceding them is a byte that encodes a - ** character in the ASCII range. - ** - ** 4. 
The entire string consists of continuation characters. - ** - ** Cases (3) and (4) require no special handling. The SQLITE_READ_UTF8() - ** macro will not overread the buffer in these cases. - */ - unsigned char *zExtra = &zTerm[-1]; - while( zExtra>zIn && (zExtra[0]&0xC0)==0x80 ){ - zExtra--; - } - - if( (zExtra[0]&0xC0)==0xC0 ){ - /* Make a copy of the last character encoding in the input string. - ** Then make sure it is nul-terminated and use SQLITE_READ_UTF8() - ** to decode the codepoint. Store the codepoint in variable iExtra, - ** it will be appended to the output string later. - */ - unsigned char *zFree = 0; - unsigned char zBuf[16]; - int nExtra = (pMem->n+zIn-zExtra); - zTerm = zExtra; - if( nExtra>15 ){ - zExtra = sqliteMallocRaw(nExtra+1); - if( !zExtra ){ - return SQLITE_NOMEM; - } - zFree = zExtra; - }else{ - zExtra = zBuf; - } - memcpy(zExtra, zTerm, nExtra); - zExtra[nExtra] = '\0'; - SQLITE_READ_UTF8(zExtra, iExtra); - sqliteFree(zFree); - } - } - if( desiredEnc==SQLITE_UTF16LE ){ /* UTF-8 -> UTF-16 Little-endian */ while( zIn UTF-16 Big-endian */ while( zInn = z - zOut; *z++ = 0; @@ -477,11 +464,11 @@ int sqlite3Utf16ByteLen(const void *zIn, int nChar){ int sqlite3Utf8To8(unsigned char *zIn){ unsigned char *zOut = zIn; unsigned char *zStart = zIn; - int c; + unsigned char *zTerm; + u32 c; - while(1){ - SQLITE_READ_UTF8(zIn, c); - if( c==0 ) break; + while( zIn[0] ){ + c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); if( c!=0xfffd ){ WRITE_UTF8(zOut, c); } @@ -501,6 +488,7 @@ void sqlite3UtfSelfTest(){ unsigned int i, t; unsigned char zBuf[20]; unsigned char *z; + unsigned char *zTerm; int n; unsigned int c; @@ -509,8 +497,9 @@ void sqlite3UtfSelfTest(){ WRITE_UTF8(z, i); n = z-zBuf; z[0] = 0; + zTerm = z; z = zBuf; - SQLITE_READ_UTF8(z, c); + c = sqlite3Utf8Read(z, zTerm, (const u8**)&z); t = i; if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD; if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;