Rework the UTF8 reader logic in order to avoid the use of malloc().

Ticket #2523. (CVS 4175)

FossilOrigin-Name: 9a059cb6bced5cdc950f7816602ac92d89a899be
This commit is contained in:
drh 2007-07-23 19:12:41 +00:00
parent ad6b3159be
commit 6615095629
5 changed files with 134 additions and 195 deletions

View File

@ -1,5 +1,5 @@
C Fix\sa\sbad\ssizeof\sin\svdbe.c.\s\sTicket\s#2522.\s(CVS\s4174)
D 2007-07-22T19:10:21
C Rework\sthe\sUTF8\sreader\slogic\sin\sorder\sto\savoid\sthe\suse\sof\smalloc().\nTicket\s#2523.\s(CVS\s4175)
D 2007-07-23T19:12:42
F Makefile.in 0c0e53720f658c7a551046442dd7afba0b72bfbe
F Makefile.linux-gcc 65241babba6faf1152bf86574477baab19190499
F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
@ -78,7 +78,7 @@ F src/date.c 6049db7d5a8fdf2c677ff7d58fa31d4f6593c988
F src/delete.c 5c0d89b3ef7d48fe1f5124bfe8341f982747fe29
F src/experimental.c 1b2d1a6cd62ecc39610e97670332ca073c50792b
F src/expr.c de9f55b1baed00199466028ad96967208d487798
F src/func.c 6b45261aa2c514f642201b90493af68469c04af6
F src/func.c dcba54fc18d2b2fd02f8b7c3dc13e27d100a4d8e
F src/hash.c 67b23e14f0257b69a3e8aa663e4eeadc1a2b6fd5
F src/hash.h 1b3f7e2609141fd571f62199fc38687d262e9564
F src/insert.c 89d184422d85db0418e0f66032ccea3657078ecd
@ -111,7 +111,7 @@ F src/server.c 087b92a39d883e3fa113cae259d64e4c7438bc96
F src/shell.c e7534cce78398bc1cac4a643e931fc6221c2897e
F src/sqlite.h.in 8164526b1658a6dad472953ea91239849f913d45
F src/sqlite3ext.h a27bedc222df5e5f0f458ac99726d0483b953a91
F src/sqliteInt.h 81183ae71162818bf60478e738ff68604128bb06
F src/sqliteInt.h 358f3a29b98e1efdd840a928dec8f60a51e6a33e
F src/sqliteLimit.h f14609c27636ebc217c9603ade26dbdd7d0f6afa
F src/table.c a8de75bcedf84d4060d804264b067ab3b1a3561d
F src/tclsqlite.c 0d3370e01cd3b313ed29ed6b0ba00423b4329de0
@ -137,7 +137,7 @@ F src/test_tclvar.c ea4500a60d663f7fdf18fd3210efc112e0c6e7f0
F src/tokenize.c 0f0955ef7b8ab99ba2d3099faa89b80ccba3733a
F src/trigger.c 420192efe3e6f03addf7897c60c3c8bf913d3493
F src/update.c 6b10becb6235ea314ed245fbfbf8b38755e3166e
F src/utf.c 01b2aba02b10d12903e9e1ff897215c9faf6b662
F src/utf.c c152f99ddccc5e0214a9817aa07ab1b208b43f14
F src/util.c 9e81d417fc60bd2fe156f8f2317aa4845bc6cc90
F src/vacuum.c 8bd895d29e7074e78d4e80f948e35ddc9cf2beef
F src/vdbe.c a58fe70f11078deb16f6825cc99f099d2fad4a7b
@ -520,7 +520,7 @@ F www/tclsqlite.tcl 8be95ee6dba05eabcd27a9d91331c803f2ce2130
F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
F www/whentouse.tcl fc46eae081251c3c181bd79c5faef8195d7991a5
P 1924ba5207bdc8d503c17cd9460c1a9f9c357635
R 6a3d5d19ad9da4a9718db45f3a6f4e18
P 77ebc3feb089c28155cf20873fb4eabd26fa50c1
R 4c6f94c5ade866798dc608d64060285b
U drh
Z f3b0c8bff800cc59d8eb156576c3d0e8
Z 9a4a3510d0a6e206d28b34d524cb6b1e

View File

@ -1 +1 @@
77ebc3feb089c28155cf20873fb4eabd26fa50c1
9a059cb6bced5cdc950f7816602ac92d89a899be

View File

@ -16,7 +16,7 @@
** sqliteRegisterBuildinFunctions() found at the bottom of the file.
** All other code has file scope.
**
** $Id: func.c,v 1.161 2007/06/22 15:21:16 danielk1977 Exp $
** $Id: func.c,v 1.162 2007/07/23 19:12:42 drh Exp $
*/
#include "sqliteInt.h"
#include <ctype.h>
@ -26,6 +26,7 @@
#include "vdbeInt.h"
#include "os.h"
/*
** Return the collating function associated with a function.
*/
@ -397,15 +398,6 @@ static const struct compareInfo likeInfoNorm = { '%', '_', 0, 1 };
** is case sensitive causing 'a' LIKE 'A' to be false */
static const struct compareInfo likeInfoAlt = { '%', '_', 0, 0 };
/*
** Decode the UTF-8 character that begins at z and return its value.
** z is received by value, so the caller's pointer is not advanced.
*/
u32 sqlite3ReadUtf8(const unsigned char *z){
  const unsigned char *zCur = z;   /* Cursor that the macro advances */
  u32 uChar;                       /* Decoded character value */
  SQLITE_READ_UTF8(zCur, uChar);
  return uChar;
}
/*
** Compare two UTF-8 strings for equality where the first string can
** potentially be a "glob" expression. Return true (1) if they
@ -440,97 +432,102 @@ static int patternCompare(
const struct compareInfo *pInfo, /* Information about how to do the compare */
const int esc /* The escape character */
){
register int c;
int c, c2;
int invert;
int seen;
int c2;
u8 matchOne = pInfo->matchOne;
u8 matchAll = pInfo->matchAll;
u8 matchSet = pInfo->matchSet;
u8 noCase = pInfo->noCase;
int prevEscape = 0; /* True if the previous character was 'escape' */
while( (c = *zPattern)!=0 ){
while( (c = sqlite3Utf8Read(zPattern,0,&zPattern))!=0 ){
if( !prevEscape && c==matchAll ){
while( (c=zPattern[1]) == matchAll || c == matchOne ){
if( c==matchOne ){
if( *zString==0 ) return 0;
SQLITE_SKIP_UTF8(zString);
while( (c=sqlite3Utf8Read(zPattern,0,&zPattern)) == matchAll
|| c == matchOne ){
if( c==matchOne && sqlite3Utf8Read(zString, 0, &zString)==0 ){
return 0;
}
zPattern++;
}
if( c && esc && sqlite3ReadUtf8(&zPattern[1])==esc ){
u8 const *zTemp = &zPattern[1];
SQLITE_SKIP_UTF8(zTemp);
c = *zTemp;
}
if( c==0 ) return 1;
if( c==matchSet ){
assert( esc==0 ); /* This is GLOB, not LIKE */
while( *zString && patternCompare(&zPattern[1],zString,pInfo,esc)==0 ){
if( c==0 ){
return 1;
}else if( c==esc ){
c = sqlite3Utf8Read(zPattern, 0, &zPattern);
if( c==0 ){
return 0;
}
}else if( c==matchSet ){
assert( esc==0 ); /* This is GLOB, not LIKE */
assert( matchSet<0x80 ); /* '[' is a single-byte character */
while( *zString && patternCompare(&zPattern[-1],zString,pInfo,esc)==0 ){
SQLITE_SKIP_UTF8(zString);
}
return *zString!=0;
}else{
while( (c2 = *zString)!=0 ){
if( noCase ){
c2 = sqlite3UpperToLower[c2];
c = sqlite3UpperToLower[c];
while( c2 != 0 && c2 != c ){ c2 = sqlite3UpperToLower[*++zString]; }
}else{
while( c2 != 0 && c2 != c ){ c2 = *++zString; }
}
while( (c2 = sqlite3Utf8Read(zString,0,&zString))!=0 ){
if( noCase ){
c2 = c2<0x80 ? sqlite3UpperToLower[c2] : c2;
c = c<0x80 ? sqlite3UpperToLower[c] : c;
while( c2 != 0 && c2 != c ){
c2 = sqlite3Utf8Read(zString, 0, &zString);
if( c2<0x80 ) c2 = sqlite3UpperToLower[c2];
}
}else{
while( c2 != 0 && c2 != c ){
c2 = sqlite3Utf8Read(zString, 0, &zString);
}
if( c2==0 ) return 0;
if( patternCompare(&zPattern[1],zString,pInfo,esc) ) return 1;
SQLITE_SKIP_UTF8(zString);
}
if( c2==0 ) return 0;
if( patternCompare(zPattern,zString,pInfo,esc) ) return 1;
}
return 0;
}else if( !prevEscape && c==matchOne ){
if( sqlite3Utf8Read(zString, 0, &zString)==0 ){
return 0;
}
}else if( !prevEscape && c==matchOne ){
if( *zString==0 ) return 0;
SQLITE_SKIP_UTF8(zString);
zPattern++;
}else if( c==matchSet ){
int prior_c = 0;
assert( esc==0 ); /* This only occurs for GLOB, not LIKE */
seen = 0;
invert = 0;
c = sqlite3ReadUtf8(zString);
c = sqlite3Utf8Read(zString, 0, &zString);
if( c==0 ) return 0;
c2 = *++zPattern;
if( c2=='^' ){ invert = 1; c2 = *++zPattern; }
c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
if( c2=='^' ){
invert = 1;
c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
}
if( c2==']' ){
if( c==']' ) seen = 1;
c2 = *++zPattern;
c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
}
while( (c2 = sqlite3ReadUtf8(zPattern))!=0 && c2!=']' ){
if( c2=='-' && zPattern[1]!=']' && zPattern[1]!=0 && prior_c>0 ){
zPattern++;
c2 = sqlite3ReadUtf8(zPattern);
while( c2 && c2!=']' ){
if( c2=='-' && zPattern[0]!=']' && zPattern[0]!=0 && prior_c>0 ){
c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
if( c>=prior_c && c<=c2 ) seen = 1;
prior_c = 0;
}else if( c==c2 ){
seen = 1;
prior_c = c2;
}else{
if( c==c2 ){
seen = 1;
}
prior_c = c2;
}
SQLITE_SKIP_UTF8(zPattern);
c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
}
if( c2==0 || (seen ^ invert)==0 ) return 0;
SQLITE_SKIP_UTF8(zString);
zPattern++;
}else if( esc && !prevEscape && sqlite3ReadUtf8(zPattern)==esc){
if( c2==0 || (seen ^ invert)==0 ){
return 0;
}
}else if( esc==c && !prevEscape ){
prevEscape = 1;
SQLITE_SKIP_UTF8(zPattern);
}else{
c2 = sqlite3Utf8Read(zString, 0, &zString);
if( noCase ){
if( sqlite3UpperToLower[c] != sqlite3UpperToLower[*zString] ) return 0;
}else{
if( c != *zString ) return 0;
c = c<0x80 ? sqlite3UpperToLower[c] : c;
c2 = c2<0x80 ? sqlite3UpperToLower[c2] : c2;
}
if( c!=c2 ){
return 0;
}
zPattern++;
zString++;
prevEscape = 0;
}
}
@ -590,7 +587,7 @@ static void likeFunc(
"ESCAPE expression must be a single character", -1);
return;
}
escape = sqlite3ReadUtf8(zEsc);
escape = sqlite3Utf8Read(zEsc, 0, &zEsc);
}
if( zA && zB ){
struct compareInfo *pInfo = sqlite3_user_data(context);

View File

@ -11,7 +11,7 @@
*************************************************************************
** Internal interface definitions for SQLite.
**
** @(#) $Id: sqliteInt.h,v 1.578 2007/06/26 10:38:55 danielk1977 Exp $
** @(#) $Id: sqliteInt.h,v 1.579 2007/07/23 19:12:42 drh Exp $
*/
#ifndef _SQLITEINT_H_
#define _SQLITEINT_H_
@ -1556,62 +1556,15 @@ typedef struct {
extern int sqlite3_always_code_trigger_setup;
/*
** A lookup table used by the SQLITE_READ_UTF8 macro. The definition
** is in utf.c.
** Assuming zIn points to the first byte of a UTF-8 character,
** advance zIn to point to the first byte of the next UTF-8 character.
*/
extern const unsigned char sqlite3UtfTrans1[];
/*
** Macros for reading UTF8 characters.
**
** SQLITE_READ_UTF8(x,c) reads a single UTF8 value out of x and writes
** that value into c. The type of x must be unsigned char*. The type
** of c must be unsigned int.
**
** SQLITE_SKIP_UTF8(x) advances x forward by one character. The type of
** x must be unsigned char*.
**
** Notes On Invalid UTF-8:
**
** * These macros never allow a 7-bit character (0x00 through 0x7f) to
** be encoded as a multi-byte character. Any multi-byte character that
** attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
**
** * These macros never allow a UTF16 surrogate value to be encoded.
** If a multi-byte character attempts to encode a value between
** 0xd800 and 0xe000 then it is rendered as 0xfffd.
**
** * Bytes in the range of 0x80 through 0xbf which occur as the first
** byte of a character are interpreted as single-byte characters
** and rendered as themselves even though they are technically
** invalid characters.
**
** * These routines accept an infinite number of different UTF8 encodings
** for unicode values 0x80 and greater. They do not change over-length
** encodings to 0xfffd as some systems recommend.
**
*/
/*
** SQLITE_READ_UTF8(zIn, c): decode one UTF-8 character starting at zIn,
** store its value in c, and leave zIn pointing at the first byte after
** the character. zIn is evaluated and modified several times, so it must
** be a plain pointer variable. A lead byte of 0xc0 or above is mapped
** through sqlite3UtfTrans1[] and each following byte of the form 10xxxxxx
** contributes six more bits; the loop has no length limit and stops only
** at the first byte that is not a continuation byte. A multi-byte result
** that is below 0x80, lies in 0xD800..0xDFFF, or equals 0xFFFE or 0xFFFF
** is replaced by 0xFFFD.
*/
#define SQLITE_READ_UTF8(zIn, c) { \
c = *(zIn++); \
if( c>=0xc0 ){ \
c = sqlite3UtfTrans1[c-0xc0]; \
while( (*zIn & 0xc0)==0x80 ){ \
c = (c<<6) + (0x3f & *(zIn++)); \
} \
if( c<0x80 \
|| (c&0xFFFFF800)==0xD800 \
|| (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \
} \
}
/*
** SQLITE_SKIP_UTF8(zIn): step zIn over one character without decoding it.
** One byte is always consumed; if that byte is 0xc0 or greater, every
** following byte of the form 10xxxxxx is consumed as well. zIn is
** modified in place and evaluated more than once.
*/
#define SQLITE_SKIP_UTF8(zIn) { \
if( (*(zIn++))>=0xc0 ){ \
while( (*zIn & 0xc0)==0x80 ){ zIn++; } \
} \
}
/*
** The SQLITE_CORRUPT_BKPT macro can be either a constant (for production
** builds) or a function call (for debugging). If it is a function call,
@ -1830,7 +1783,7 @@ int sqlite3GetInt32(const char *, int*);
int sqlite3FitsIn64Bits(const char *);
int sqlite3Utf16ByteLen(const void *pData, int nChar);
int sqlite3Utf8CharLen(const char *pData, int nByte);
u32 sqlite3ReadUtf8(const unsigned char *);
int sqlite3Utf8Read(const u8*, const u8*, const u8**);
int sqlite3PutVarint(unsigned char *, u64);
int sqlite3GetVarint(const unsigned char *, u64 *);
int sqlite3GetVarint32(const unsigned char *, u32 *);

129
src/utf.c
View File

@ -12,7 +12,7 @@
** This file contains routines used to translate between UTF-8,
** UTF-16, UTF-16BE, and UTF-16LE.
**
** $Id: utf.c,v 1.51 2007/05/23 16:23:09 danielk1977 Exp $
** $Id: utf.c,v 1.52 2007/07/23 19:12:42 drh Exp $
**
** Notes on UTF-8:
**
@ -60,6 +60,7 @@ const unsigned char sqlite3UtfTrans1[] = {
0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
#define WRITE_UTF8(zOut, c) { \
if( c<0x00080 ){ \
*zOut++ = (c&0xFF); \
@ -126,6 +127,54 @@ const unsigned char sqlite3UtfTrans1[] = {
} \
}
/*
** Translate a single UTF-8 character.  Return the unicode value.
**
** During translation, assume that the byte that zTerm points to
** is a 0x00: continuation bytes are consumed only while z!=zTerm.
** Passing zTerm==0 disables that limit, in which case decoding stops
** at the first byte that is not a continuation byte.  The first byte
** at z is always read, even when z==zTerm.
**
** Write a pointer to the next unread byte back into *pzNext.
**
** Notes On Invalid UTF-8:
**
**  *  This routine never allows a 7-bit character (0x00 through 0x7f) to
**     be encoded as a multi-byte character.  Any multi-byte character that
**     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
**
**  *  This routine never allows a UTF16 surrogate value to be encoded.
**     If a multi-byte character attempts to encode a value between
**     0xd800 and 0xe000 then it is rendered as 0xfffd.
**
**  *  Bytes in the range of 0x80 through 0xbf which occur as the first
**     byte of a character are interpreted as single-byte characters
**     and rendered as themselves even though they are technically
**     invalid characters.
**
**  *  This routine accepts an infinite number of different UTF8 encodings
**     for unicode values 0x80 and greater.  It does not change over-length
**     encodings to 0xfffd as some systems recommend.
**
**  *  Because any number of continuation bytes is accepted, the value is
**     accumulated in an unsigned integer so that the shift can never
**     overflow a signed type.  A result too large to be returned as a
**     non-negative int is rendered as 0xfffd.
*/
int sqlite3Utf8Read(
  const unsigned char *z,         /* First byte of UTF-8 character */
  const unsigned char *zTerm,     /* Pretend this byte is 0x00 */
  const unsigned char **pzNext    /* Write first byte past UTF-8 char here */
){
  unsigned int c = *(z++);        /* Unsigned: wraps instead of overflowing */
  if( c>=0xc0 ){
    c = sqlite3UtfTrans1[c-0xc0];
    while( z!=zTerm && (*z & 0xc0)==0x80 ){
      c = (c<<6) + (0x3f & *(z++));
    }
    if( c<0x80
     || c>0x7fffffff              /* Would be negative as an int */
     || (c&0xFFFFF800)==0xD800
     || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }
  }
  *pzNext = z;
  return (int)c;
}
/*
** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
@ -219,81 +268,19 @@ int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
z = zOut;
if( pMem->enc==SQLITE_UTF8 ){
unsigned int iExtra = 0xD800;
if( 0==(pMem->flags&MEM_Term) && zTerm>zIn && (zTerm[-1]&0x80) ){
/* This UTF8 string is not nul-terminated, and the last byte is
** not a character in the ASCII range (codepoints 0..127). This
** means the SQLITE_READ_UTF8() macro might read past the end
** of the allocated buffer.
**
** There are four possibilities:
**
** 1. The last byte is the first byte of a non-ASCII character,
**
** 2. The final N bytes of the input string are continuation bytes
** and immediately preceding them is the first byte of a
** non-ASCII character.
**
** 3. The final N bytes of the input string are continuation bytes
** and immediately preceding them is a byte that encodes a
** character in the ASCII range.
**
** 4. The entire string consists of continuation characters.
**
** Cases (3) and (4) require no special handling. The SQLITE_READ_UTF8()
** macro will not overread the buffer in these cases.
*/
unsigned char *zExtra = &zTerm[-1];
while( zExtra>zIn && (zExtra[0]&0xC0)==0x80 ){
zExtra--;
}
if( (zExtra[0]&0xC0)==0xC0 ){
/* Make a copy of the last character encoding in the input string.
** Then make sure it is nul-terminated and use SQLITE_READ_UTF8()
** to decode the codepoint. Store the codepoint in variable iExtra,
** it will be appended to the output string later.
*/
unsigned char *zFree = 0;
unsigned char zBuf[16];
int nExtra = (pMem->n+zIn-zExtra);
zTerm = zExtra;
if( nExtra>15 ){
zExtra = sqliteMallocRaw(nExtra+1);
if( !zExtra ){
return SQLITE_NOMEM;
}
zFree = zExtra;
}else{
zExtra = zBuf;
}
memcpy(zExtra, zTerm, nExtra);
zExtra[nExtra] = '\0';
SQLITE_READ_UTF8(zExtra, iExtra);
sqliteFree(zFree);
}
}
if( desiredEnc==SQLITE_UTF16LE ){
/* UTF-8 -> UTF-16 Little-endian */
while( zIn<zTerm ){
SQLITE_READ_UTF8(zIn, c);
c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
WRITE_UTF16LE(z, c);
}
if( iExtra!=0xD800 ){
WRITE_UTF16LE(z, iExtra);
}
}else{
assert( desiredEnc==SQLITE_UTF16BE );
/* UTF-8 -> UTF-16 Big-endian */
while( zIn<zTerm ){
SQLITE_READ_UTF8(zIn, c);
c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
WRITE_UTF16BE(z, c);
}
if( iExtra!=0xD800 ){
WRITE_UTF16BE(z, iExtra);
}
}
pMem->n = z - zOut;
*z++ = 0;
@ -477,11 +464,11 @@ int sqlite3Utf16ByteLen(const void *zIn, int nChar){
int sqlite3Utf8To8(unsigned char *zIn){
unsigned char *zOut = zIn;
unsigned char *zStart = zIn;
int c;
unsigned char *zTerm;
u32 c;
while(1){
SQLITE_READ_UTF8(zIn, c);
if( c==0 ) break;
while( zIn[0] ){
c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
if( c!=0xfffd ){
WRITE_UTF8(zOut, c);
}
@ -501,6 +488,7 @@ void sqlite3UtfSelfTest(){
unsigned int i, t;
unsigned char zBuf[20];
unsigned char *z;
unsigned char *zTerm;
int n;
unsigned int c;
@ -509,8 +497,9 @@ void sqlite3UtfSelfTest(){
WRITE_UTF8(z, i);
n = z-zBuf;
z[0] = 0;
zTerm = z;
z = zBuf;
SQLITE_READ_UTF8(z, c);
c = sqlite3Utf8Read(z, zTerm, (const u8**)&z);
t = i;
if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;