Steps towards UTF-16 databases. Some tests are failing because of this

commit. (CVS 1433) FossilOrigin-Name: c4a8246864eee7cb993ab7b703324d92c284d72a
2004-05-22 03:05:33 +00:00 · 2004-05-22 03:05:33 +00:00 · b1bc95315b
commit b1bc95315b
parent ab01f61ab8
9 changed files with 352 additions and 189 deletions
--- a/26
+++ b/26
@ -1,5 +1,5 @@
-C Update\scomments\sand\sremove\sdead\scode\sfrom\sbtree.c\s(CVS\s1432)
-D 2004-05-22T02:55:23
+C Steps\stowards\sUTF-16\sdatabases.\sSome\stests\sare\sfailing\sbecause\sof\sthis\ncommit.\s(CVS\s1433)
+D 2004-05-22T03:05:34
 F Makefile.in ab7b0d5118e2da97bac66be8684a1034e3500f5a
 F Makefile.linux-gcc b86a99c493a5bfb402d1d9178dcdc4bd4b32f906
 F README f1de682fbbd94899d50aca13d387d1b3fd3be2dd
@ -37,7 +37,7 @@ F src/func.c cfbb7096efb58e2857e3b312a8958a12774b625a
 F src/hash.c 440c2f8cb373ee1b4e13a0988489c7cd95d55b6f
 F src/hash.h 762d95f1e567664d1eafc1687de755626be962fb
 F src/insert.c e510d62d23b4de4d901e7ccbbe7833b7fb3b9570
-F src/main.c 5604d5a9a6b31720b95e6a2cb4c804c53592f145
+F src/main.c a2be4b3976818f3fe5dfdc5709c330599da7acc3
 F src/md5.c 8e39fdae6d8776b87558e91dcc94740c9b635a9c
 F src/os.c ddcda92f7fd71b4513c57c1ec797917f206d504e
 F src/os.h 6e446a17cbeb6c2ce470683a0bb8d9c63abe8607
@ -50,24 +50,24 @@ F src/random.c eff68e3f257e05e81eae6c4d50a51eb88beb4ff3
 F src/select.c 7d77a8bed7eeac23216d42fc1be006fb4352fcdc
 F src/shell.c 0c4662e13bfbfd3d13b066c5859cc97ad2f95d21
 F src/sqlite.h.in 75b6eb9eeff3e84052444584b5ad4f0d9a81b8ac
-F src/sqliteInt.h a7b3f10c5e7231abee9ef12ee2d986554ad073df
+F src/sqliteInt.h 4b45892cb082f4883efb58c5e13328c42cbc7642
 F src/table.c af14284fa36c8d41f6829e3f2819dce07d3e2de2
 F src/tclsqlite.c fbf0fac73624ae246551a6c671f1de0235b5faa1
 F src/test1.c e5ba63a9a36fe34f48e3363887984c4d71dbf066
 F src/test2.c 6195a1ca2c8d0d2d93644e86da3289b403486872
 F src/test3.c 5e4a6d596f982f6f47a5f9f75ede9b4a3b739968
 F src/test4.c b3fab9aea7a8940a8a7386ce1c7e2157b09bd296
-F src/test5.c c92dca7028b19b9c8319d55e0a5037fc183640a6
+F src/test5.c 9a1f15133f6955f067c5246e564723b5f23ff221
 F src/tokenize.c e7536dd31205d5afb76c1bdc832dea009c7a3847
 F src/trigger.c 11afe9abfba13a2ba142944c797c952e162d117f
 F src/update.c 1a5e9182596f3ea8c7a141e308a3d2a7e5689fee
-F src/utf.c c27c4f1120f7aaef00cd6942b3d9e3f4ca4fe0e4
+F src/utf.c 537e1c98cddc623628d44497ec02c2246cf66dea
 F src/util.c 5cbeb452da09cfc7248de9948c15b14d840723f7
 F src/vacuum.c c134702e023db8778e6be59ac0ea7b02315b5476
-F src/vdbe.c 2944326a99869c71698f634d6ace9e9be56d9180
+F src/vdbe.c 91e6663c690f5208fadca0bd06b4878aed61f239
 F src/vdbe.h 391d5642a83af686f35c228fcd36cb4456d68f44
-F src/vdbeInt.h 8ed2272e97bef20c5302c3b2cb4f900e8b5e2642
-F src/vdbeaux.c 2dd437063e9a0769ce453f7ce94407934f56e2f8
+F src/vdbeInt.h f40e8048d644c8389cda16f46479376f763d56e6
+F src/vdbeaux.c 8e993bfd0f943163548ce3e09797ce5503d2366f
 F src/where.c efe5d25fe18cd7381722457898cd863e84097a0c
 F test/all.test 569a92a8ee88f5300c057cc4a8f50fbbc69a3242
 F test/attach.test cb9b884344e6cfa5e165965d5b1adea679a24c83
@ -195,7 +195,7 @@ F www/sqlite.tcl 3c83b08cf9f18aa2d69453ff441a36c40e431604
 F www/tclsqlite.tcl b9271d44dcf147a93c98f8ecf28c927307abd6da
 F www/vdbe.tcl 9b9095d4495f37697fd1935d10e14c6015e80aa1
 F www/whentouse.tcl a8335bce47cc2fddb07f19052cb0cb4d9129a8e4
-P acb65297b69c531813287166175fa7864c900fe6
-R b096a079434ae3eb2f44e598006a9ba9
-U drh
-Z 0fc6dbec68937b17866f5fe0a96973f8
+P 8069caca82bc4d40d8ac95bafdd91a18a70ab1e0
+R f44ee6ced05cfe974110947781c41eee
+U danielk1977
+Z 35da3b16c198243de8a20489bd428c37
--- a/manifest.uuid
+++ b/manifest.uuid
@ -1 +1 @@
-8069caca82bc4d40d8ac95bafdd91a18a70ab1e0
+c4a8246864eee7cb993ab7b703324d92c284d72a
--- a/src/main.c
+++ b/src/main.c
@ -14,7 +14,7 @@
 ** other files are for internal use by SQLite and should not be
 ** accessed by users of the library.
 **
-** $Id: main.c,v 1.181 2004/05/21 11:39:05 danielk1977 Exp $
+** $Id: main.c,v 1.182 2004/05/22 03:05:34 danielk1977 Exp $
 */
 #include "sqliteInt.h"
 #include "os.h"
@ -1151,7 +1151,7 @@ int sqlite3_prepare16(
  char const *zTail8 = 0;
  int rc;

-  zSql8 = sqlite3utf16to8(zSql, nBytes);
+  zSql8 = sqlite3utf16to8(zSql, nBytes, SQLITE3_BIGENDIAN);
  if( !zSql8 ){
    sqlite3Error(db, SQLITE_NOMEM, 0);
    return SQLITE_NOMEM;
@ -1197,6 +1197,7 @@ static int openDatabase(
  db->magic = SQLITE_MAGIC_BUSY;
  db->nDb = 2;
  db->aDb = db->aDbStatic;
+  db->enc = def_enc;
  /* db->flags |= SQLITE_ShortColNames; */
  sqlite3HashInit(&db->aFunc, SQLITE_HASH_STRING, 1);
  sqlite3HashInit(&db->aCollSeq, SQLITE_HASH_STRING, 0);
@ -1252,6 +1253,7 @@ int sqlite3_open_new(
  const char **options
 ){
  return openDatabase(zFilename, ppDb, options, TEXT_Utf8);
+  /* return openDatabase(zFilename, ppDb, options, TEXT_Utf16le); */
 }

 sqlite *sqlite3_open(const char *zFilename, int mode, char **pzErrMsg){
@ -1280,7 +1282,7 @@ int sqlite3_open16(

  assert( ppDb );

-  zFilename8 = sqlite3utf16to8(zFilename, -1);
+  zFilename8 = sqlite3utf16to8(zFilename, -1, SQLITE3_BIGENDIAN);
  if( !zFilename8 ){
    *ppDb = 0;
    return SQLITE_NOMEM;
@ -1337,7 +1339,7 @@ int sqlite3_open16(const void *filename, sqlite3 **pDb, const char **options){
  int rc;
  char * filename8;

-  filename8 = sqlite3utf16to8(filename, -1);
+  filename8 = sqlite3utf16to8(filename, -1, SQLITE3_BIGENDIAN);
  if( !filename8 ){
    return SQLITE_NOMEM;
  }
--- a/src/sqliteInt.h
+++ b/src/sqliteInt.h
@ -11,7 +11,7 @@
 *************************************************************************
 ** Internal interface definitions for SQLite.
 **
-** @(#) $Id: sqliteInt.h,v 1.244 2004/05/21 10:08:54 danielk1977 Exp $
+** @(#) $Id: sqliteInt.h,v 1.245 2004/05/22 03:05:34 danielk1977 Exp $
 */
 #include "config.h"
 #include "sqlite.h"
@ -282,7 +282,6 @@ struct Db {
  u16 flags;           /* Flags associated with this database */
  void *pAux;          /* Auxiliary data.  Usually NULL */
  void (*xFreeAux)(void*);  /* Routine to free pAux */
-  u8 textEnc;          /* Text encoding for this database. */
 };

 /*
@ -415,6 +414,7 @@ struct sqlite {
  int errCode;                  /* Most recent error code (SQLITE_*) */
  char *zErrMsg;                /* Most recent error message (UTF-8 encoded) */
  void *zErrMsg16;              /* Most recent error message (UTF-16 encoded) */
+  u8 enc;                       /* Text encoding for this database. */
 };

 /*
@ -652,6 +652,7 @@ struct FKey {
 ** otherwise be equal, then return a result as if the second key larger.
 */
 struct KeyInfo {
+  u8 enc;             /* Text encoding - one of the TEXT_Utf* values */
  u8 incrKey;         /* Increase 2nd key by epsilon before comparison */
  int nField;         /* Number of entries in aColl[] */
  u8 *aSortOrder;     /* If defined an aSortOrder[i] is true, sort DESC */
@ -1341,7 +1342,7 @@ char *sqlite3_snprintf(int,char*,const char*,...);
 int sqlite3GetInt32(const char *, int*);
 int sqlite3GetInt64(const char *, i64*);
 int sqlite3FitsIn64Bits(const char *);
-unsigned char *sqlite3utf16to8(const void *pData, int N);
+unsigned char *sqlite3utf16to8(const void *pData, int N, int big_endian);
 void *sqlite3utf8to16be(const unsigned char *pIn, int N);
 void *sqlite3utf8to16le(const unsigned char *pIn, int N);
 void sqlite3utf16to16le(void *pData, int N);
@ -1361,4 +1362,4 @@ int sqlite3IndexAffinityOk(Expr *pExpr, char idx_affinity);
 char sqlite3ExprAffinity(Expr *pExpr);
 int sqlite3atoi64(const char*, i64*);
 void sqlite3Error(sqlite *, int, const char*,...);
-
+int sqlite3utfTranslate(const void *, int , u8 , void **, int *, u8);
--- a/src/test5.c
+++ b/src/test5.c
@ -15,9 +15,10 @@
 ** is used for testing the SQLite routines for converting between
 ** the various supported unicode encodings.
 **
-** $Id: test5.c,v 1.4 2004/05/19 10:34:53 danielk1977 Exp $
+** $Id: test5.c,v 1.5 2004/05/22 03:05:34 danielk1977 Exp $
 */
 #include "sqliteInt.h"
+#include "os.h"         /* to get SQLITE3_BIGENDIAN */
 #include "tcl.h"
 #include <stdlib.h>
 #include <string.h>
@ -166,7 +167,7 @@ static int sqlite_utf16to8(
  }

  in = Tcl_GetByteArrayFromObj(objv[1], 0);
-  out = sqlite3utf16to8(in, -1);
+  out = sqlite3utf16to8(in, -1, SQLITE3_BIGENDIAN);
  res = Tcl_NewByteArrayObj(out, strlen(out)+1);
  sqliteFree(out);

--- a/src/utf.c
+++ b/src/utf.c
@ -12,7 +12,7 @@
 ** This file contains routines used to translate between UTF-8, 
 ** UTF-16, UTF-16BE, and UTF-16LE.
 **
-** $Id: utf.c,v 1.6 2004/05/20 11:00:52 danielk1977 Exp $
+** $Id: utf.c,v 1.7 2004/05/22 03:05:34 danielk1977 Exp $
 **
 ** Notes on UTF-8:
 **
@ -53,6 +53,7 @@
 #include <assert.h>
 #include <unistd.h>
 #include "sqliteInt.h"
+#include "os.h"

 typedef struct UtfString UtfString;
 struct UtfString {
@ -92,13 +93,13 @@ struct UtfString {
 /*
 ** Read the BOM from the start of *pStr, if one is present. Return zero
 ** for little-endian, non-zero for big-endian. If no BOM is present, return
-** the machines native byte order.
+** the value of the parameter "big_endian".
 **
 ** Return values:
 **     1 -> big-endian string
 **     0 -> little-endian string
 */
-static int readUtf16Bom(UtfString *pStr){
+static int readUtf16Bom(UtfString *pStr, int big_endian){
  /* The BOM must be the first thing read from the string */
  assert( pStr->c==0 );

@ -121,7 +122,7 @@ static int readUtf16Bom(UtfString *pStr){
    }
  }

-  return SQLITE3_NATIVE_BIGENDIAN;
+  return big_endian;
 }


@ -375,8 +376,10 @@ int sqlite3utf16ByteLen(const void *pZ, int nChar){
    str.c = 0;
    str.n = -1;

-    /* Check for a BOM */
-    big_endian = readUtf16Bom(&str);
+    /* Check for a BOM. We just ignore it if there is one, it's only read
+    ** so that it is not counted as a character. 
+    */
+    big_endian = readUtf16Bom(&str, 0);
    ret = 0-str.c;

    while( code!=0 && nRead<nChar ){
@ -400,10 +403,9 @@ int sqlite3utf16ByteLen(const void *pZ, int nChar){
 **
 ** The returned UTF-8 string is always \000 terminated.
 */
-unsigned char *sqlite3utf16to8(const void *pData, int N){
+unsigned char *sqlite3utf16to8(const void *pData, int N, int big_endian){
  UtfString in;
  UtfString out;
-  int big_endian;

  out.pZ = 0;

@ -426,7 +428,7 @@ unsigned char *sqlite3utf16to8(const void *pData, int N){
  }
  out.c = 0;

-  big_endian = readUtf16Bom(&in);
+  big_endian = readUtf16Bom(&in, big_endian);
  while( in.c<in.n ){
    writeUtf8(&out, readUtf16(&in, big_endian));
  }
@ -503,7 +505,7 @@ static void utf16to16(void *pData, int N, int big_endian){
    inout.n = sqlite3utf16ByteLen(inout.pZ, -1);
  }

-  if( readUtf16Bom(&inout)!=big_endian ){
+  if( readUtf16Bom(&inout, SQLITE3_BIGENDIAN)!=big_endian ){
    /* swab(&inout.pZ[inout.c], inout.pZ, inout.n-inout.c); */
    int i;
    for(i=0; i<(inout.n-inout.c); i += 2){
@ -554,6 +556,39 @@ void sqlite3utf16to16be(void *pData, int N){
  utf16to16(pData, N, 1);
 }

+/*
+** Translate zData between UTF-8 and UTF-16: exactly one of enc1/enc2 must be
+** TEXT_Utf8. The result is returned in dynamically allocated memory.
+*/
+int sqlite3utfTranslate(
+  const void *zData,    /* Input string, encoded as enc1 */
+  int nData,            /* Length argument handed on to the converter */
+  u8 enc1,              /* Encoding of zData (a TEXT_Utf* value) */
+  void **zOut,          /* OUT: the translated string */
+  int *nOut,            /* OUT: byte count computed for *zOut */
+  u8 enc2               /* Encoding required for *zOut (a TEXT_Utf* value) */
+){
+  assert( enc1==TEXT_Utf8 || enc1==TEXT_Utf16le || enc1==TEXT_Utf16be );
+  assert( enc2==TEXT_Utf8 || enc2==TEXT_Utf16le || enc2==TEXT_Utf16be );
+  assert( 
+    (enc1==TEXT_Utf8 && (enc2==TEXT_Utf16le || enc2==TEXT_Utf16be)) ||
+    (enc2==TEXT_Utf8 && (enc1==TEXT_Utf16le || enc1==TEXT_Utf16be))
+  );

-
+  if( enc1==TEXT_Utf8 ){   /* UTF-8 in, UTF-16 out */
+    if( enc2==TEXT_Utf16le ){
+      *zOut = sqlite3utf8to16le(zData, nData);
+    }else{
+      *zOut = sqlite3utf8to16be(zData, nData);
+    }
+    if( !(*zOut) ) return SQLITE_NOMEM;   /* converter returned NULL */
+    *nOut = sqlite3utf16ByteLen(*zOut, -1)+2;  /* +2: presumably the 2-byte nul */
+  }else{                   /* UTF-16 in, UTF-8 out */
+    *zOut = sqlite3utf16to8(zData, nData, enc1==TEXT_Utf16be); /* big_endian flag */
+    if( !(*zOut) ) return SQLITE_NOMEM;   /* converter returned NULL */
+    *nOut = strlen(*zOut)+1;              /* +1 counts the nul terminator */
+  }
+  return SQLITE_OK;
+}
+ 

--- a/src/vdbe.c
+++ b/src/vdbe.c
@ -43,7 +43,7 @@
 ** in this file for details.  If in doubt, do not deviate from existing
 ** commenting and indentation practices when changing or adding code.
 **
-** $Id: vdbe.c,v 1.315 2004/05/21 13:39:51 drh Exp $
+** $Id: vdbe.c,v 1.316 2004/05/22 03:05:34 danielk1977 Exp $
 */
 #include "sqliteInt.h"
 #include "os.h"
@ -69,6 +69,32 @@ int sqlite3_search_count = 0;
 */
 int sqlite3_interrupt_count = 0;

+/*
+** NulTermify
+** Stringify
+** Integerify
+** Realify
+** SetEncoding
+** Release
+*/
+struct MemRecord {   /* Header details kept alongside a serialized record */
+  char *zData;    /* Serialized record */
+  int nField;     /* Number of fields in the header */
+  int nHeader;    /* Number of bytes in the entire header */
+  u64 *aType;     /* Type values for all entries in the record */
+};
+typedef struct MemRecord MemRecord;
+
+/*
+** Transform the value stored in pMem, which must be a blob, into a
+** MemRecord. A Mem cell used to store a MemRecord works as follows:
+**
+** Mem.z points at a MemRecord struct
+*/
+static int Recordify(Mem *pMem){
+  return 0;   /* Placeholder: no transformation is performed yet */
+}
+
 #define NulTermify(P) if(((P)->flags & MEM_Str)==0){hardStringify(P);} \
                      else if(((P)->flags & MEM_Term)==0){hardNulTermify(P);}
 static int hardNulTermify(Mem *pStack){
@ -179,88 +205,155 @@ static void hardRealify(Mem *pStack){
  pStack->flags |= MEM_Real;
 }

+/*
+** Parameter "flags" is the value of the flags for a string Mem object.
+** Return one of TEXT_Utf8, TEXT_Utf16le or TEXT_Utf16be, depending
+** on the encoding indicated by the flags value.
+*/
+static u8 flagsToEnc(int flags){
+  if( flags&MEM_Utf8 ){
+    assert( !(flags&(MEM_Utf16be|MEM_Utf16le)) );  /* only one encoding bit */
+    return TEXT_Utf8;
+  }
+  if( flags&MEM_Utf16le ){
+    assert( !(flags&(MEM_Utf8|MEM_Utf16be)) );     /* only one encoding bit */
+    return TEXT_Utf16le;
+  }
+  assert( flags&MEM_Utf16be );   /* no other encoding bit is left to be set */
+  assert( !(flags&(MEM_Utf8|MEM_Utf16le)) );
+  return TEXT_Utf16be;           /* also the result when no encoding bit is set */
+}
+
+/*
+** Parameter "enc" is one of TEXT_Utf8, TEXT_Utf16le or TEXT_Utf16be. Return
+** the corresponding MEM_Utf* value (0 for anything else if assert() is off).
+*/
+static int encToFlags(u8 enc){
+  switch( enc ){
+    case TEXT_Utf8: return MEM_Utf8;
+    case TEXT_Utf16be: return MEM_Utf16be;
+    case TEXT_Utf16le: return MEM_Utf16le;
+    default: assert(0); return 0;  /* never fall off the end of the function */
+  }
+}
+
 /*
 ** If pMem is a string object, this routine sets the encoding of the string
 ** (to one of UTF-8 or UTF16) and whether or not the string is
 ** nul-terminated. If pMem is not a string object, then this routine is
 ** a no-op.
 **
-** If argument "utf16" is true, then this routine will attempt to convert
-** the string to native byte order UTF-16 encoding. Otherwise, the
-** conversion is to UTF-8 encoding. If the "term" argument is true, then a
-** nul terminator is added to the string if it does not already have one.
-**
-**
+** The second argument, "flags" consists of one of MEM_Utf8, MEM_Utf16le
+** or MEM_Utf16be, possibly ORed with MEM_Term. If necessary this function
+** manipulates the value stored by pMem so that it matches the flags passed
+** in "flags".
 **
 ** SQLITE_OK is returned if the conversion is successful (or not required).
 ** SQLITE_NOMEM may be returned if a malloc() fails during conversion
 ** between formats.
 */
-static int SetEncoding(Mem *pMem, int flags){
-  int f;
-  if( !(pMem->flags&MEM_Str) ){
+int SetEncoding(Mem *pMem, int flags){
+  u8 enc1;    /* Current string encoding (TEXT_Utf* value) */
+  u8 enc2;    /* Required string encoding (TEXT_Utf* value) */
+
+  /* If this is not a string, do nothing. */
+  if( !(pMem->flags&MEM_Str) || pMem->flags&MEM_Int || pMem->flags&MEM_Real ){
    return SQLITE_OK;
  }

-  f = (pMem->flags)&(MEM_Utf8|MEM_Utf16le|MEM_Utf16be|MEM_Term);
-  assert( flags==(flags&(MEM_Utf8|MEM_Utf16le|MEM_Utf16be|MEM_Term)));
-  if( f==flags ){
-    return SQLITE_OK;
-  }
+  enc1 = flagsToEnc(pMem->flags);
+  enc2 = flagsToEnc(flags);

-  if( (SQLITE3_BIGENDIAN    && (f&MEM_Utf16le)) ||
-      (SQLITE3_LITTLEENDIAN && (f&MEM_Utf16be)) ){
-    int i;
-    for(i=0; i<pMem->n; i+=2){
-      char c = pMem->z[i];
-      pMem->z[i] = pMem->z[i+1];
-      pMem->z[i+1] = c;
+  if( enc1!=enc2 ){
+    /* If the current encoding does not match the desired encoding, then
+    ** we will need to do some translation between encodings.
+    */
+    char *z;
+    int n;
+    int rc = sqlite3utfTranslate(pMem->z, pMem->n, enc1, (void **)&z, &n, enc2);
+    if( rc!=SQLITE_OK ){
+      return rc;
    }
-  }

-  if( (flags&MEM_Utf8) && (f&(MEM_Utf16le|MEM_Utf16be)) ){
-    char *z = sqlite3utf16to8(pMem->z, pMem->n); 
-    if( !z ){
-      return SQLITE_NOMEM;
-    }
-    Release(pMem);
+    /* Free the old string if it was dynamic. The translated string from
+    ** sqlite3utfTranslate is currently always dynamically allocated and
+    ** nul terminated; this might change as a performance enhancement. */
+    Release(pMem);
    pMem->z = z;
-    pMem->n = strlen(z)+1;
-    pMem->flags = (MEM_Utf8|MEM_Dyn|MEM_Str|MEM_Term);
-    return SQLITE_OK;
+    pMem->n = n;
+    pMem->flags = (MEM_Str | MEM_Dyn | MEM_Term | flags);
  }

-  if( (flags&MEM_Utf16le) && (f&MEM_Utf8) ){
-    char *z = sqlite3utf8to16le(pMem->z, pMem->n); 
-    if( !z ){
-      return SQLITE_NOMEM;
+  if( (flags&MEM_Term) && !(pMem->flags&MEM_Term) ){
+    /* If we did not do any translation, but currently the string is
+    ** not nul terminated (and is required to be), then we add the
+    ** nul terminator now. We never have to do this if we translated
+    ** the encoding of the string, as the translation functions return
+    ** nul terminated values.
+    */
+    int f = pMem->flags;
+    int nulTermLen = 2;     /* The number of 0x00 bytes to append */
+    if( enc2==TEXT_Utf8 ){
+      nulTermLen = 1;
    }
-    Release(pMem);
-    pMem->z = z;
-    pMem->n = sqlite3utf16ByteLen(z, -1) + 2;
-    pMem->flags = (MEM_Utf16le|MEM_Dyn|MEM_Str|MEM_Term);
-    return SQLITE_OK;
-  }

-  if( (flags&MEM_Utf16be) && (f&MEM_Utf8) ){
-    char *z = sqlite3utf8to16be(pMem->z, pMem->n); 
-    if( !z ){
-      return SQLITE_NOMEM;
+    if( pMem->n+nulTermLen<=NBFS ){
+      /* If the string plus the nul terminator will fit in the Mem.zShort
+      ** buffer, and it is not already stored there, copy it there.
+      */
+      if( !(f&MEM_Short) ){
+        memcpy(pMem->zShort, pMem->z, pMem->n);
+        if( f&MEM_Dyn ){
+          sqliteFree(pMem->z);
+        }
+        pMem->z = pMem->zShort;
+        pMem->flags &= ~(MEM_Static|MEM_Ephem|MEM_Dyn);
+        pMem->flags |= MEM_Short;
+      }
+    }else{
+      /* Otherwise we have to malloc for memory. If the string is already
+      ** dynamic, use sqliteRealloc(). Otherwise sqliteMalloc() enough space
+      ** for the string and the nul terminator, and copy the string there.
+      */
+      if( f&MEM_Dyn ){
+        pMem->z = (char *)sqliteRealloc(pMem->z, pMem->n+nulTermLen);
+        if( !pMem->z ){
+          return SQLITE_NOMEM;
+        }
+      }else{
+        char *z = (char *)sqliteMalloc(pMem->n+nulTermLen);
+        if( !z ) return SQLITE_NOMEM;
+        memcpy(z, pMem->z, pMem->n);
+        pMem->z = z;
+        pMem->flags &= ~(MEM_Static|MEM_Ephem|MEM_Short);
+        pMem->flags |= MEM_Dyn;
+      }
    }
-    Release(pMem);
-    pMem->z = z;
-    pMem->n = sqlite3utf16ByteLen(z, -1) + 2;
-    pMem->flags = (MEM_Utf16be|MEM_Dyn|MEM_Str|MEM_Term);
-    return SQLITE_OK;
-  }

-  if( (flags&MEM_Term) && !(f&&MEM_Term) ){
-    NulTermify(pMem);
+    /* pMem->z now points at the string data, with enough space at the end
+    ** to insert the nul terminator. pMem->n has not yet been updated.
+    */
+    memcpy(&pMem->z[pMem->n], "\0\0", nulTermLen);
+    pMem->n += nulTermLen;
+    pMem->flags |= MEM_Term;
  }
-
  return SQLITE_OK;
 }

+int sqlite3VdbeSetEncoding(Mem *pMem, u8 enc){  /* enc: a TEXT_Utf* value */
+  switch( enc ){   /* pick the matching MEM_Utf* flag, then use SetEncoding() */
+    case TEXT_Utf8:
+      return SetEncoding(pMem, MEM_Utf8);
+    case TEXT_Utf16le:
+      return SetEncoding(pMem, MEM_Utf16le);
+    case TEXT_Utf16be:
+      return SetEncoding(pMem, MEM_Utf16be);
+    default:
+      assert(0);   /* no other encoding value is expected here */
+  }
+  return SQLITE_INTERNAL;   /* only reached for an unknown enc without assert() */
+}
+
 /*
 ** Convert the given stack entity into a string that has been obtained
 ** from sqliteMalloc().  This is different from Stringify() above in that
@ -840,11 +933,11 @@ static void applyAffinity(Mem *pRec, char affinity){
  }
 }

+#ifndef NDEBUG
 /*
 ** Write a nice string representation of the contents of cell pMem
 ** into buffer zBuf, length nBuf.
 */
-#ifndef NDEBUG
 void prettyPrintMem(Mem *pMem, char *zBuf, int nBuf){
  char *zCsr = zBuf;
  int f = pMem->flags;
@ -865,7 +958,8 @@ void prettyPrintMem(Mem *pMem, char *zBuf, int nBuf){
      c = 's';
    }

-    zCsr += sprintf(zCsr, "%c[", c);
+    zCsr += sprintf(zCsr, "%c", c);
+    zCsr += sprintf(zCsr, "%d[", pMem->n);
    for(i=0; i<16 && i<pMem->n; i++){
      zCsr += sprintf(zCsr, "%02X ", ((int)pMem->z[i] & 0xFF));
    }
@ -876,10 +970,47 @@ void prettyPrintMem(Mem *pMem, char *zBuf, int nBuf){
    }

    zCsr += sprintf(zCsr, "]");
+    *zCsr = '\0';
+  }else if( f & MEM_Str ){
+    int j, k;
+    zBuf[0] = ' ';
+    if( f & MEM_Dyn ){
+      zBuf[1] = 'z';
+      assert( (f & (MEM_Static|MEM_Ephem))==0 );
+    }else if( f & MEM_Static ){
+      zBuf[1] = 't';
+      assert( (f & (MEM_Dyn|MEM_Ephem))==0 );
+    }else if( f & MEM_Ephem ){
+      zBuf[1] = 'e';
+      assert( (f & (MEM_Static|MEM_Dyn))==0 );
+    }else{
+      zBuf[1] = 's';
+    }
+    k = 2;
+    k += sprintf(&zBuf[k], "%d", pMem->n);
+    zBuf[k++] = '[';
+    for(j=0; j<15 && j<pMem->n; j++){
+      u8 c = pMem->z[j];
+      if( c==0 && j==pMem->n-1 ) break;
+/*
+            zBuf[k++] = "0123456789ABCDEF"[c>>4];
+            zBuf[k++] = "0123456789ABCDEF"[c&0xf];
+*/
+      if( c>=0x20 && c<0x7f ){
+        zBuf[k++] = c;
+      }else{
+        zBuf[k++] = '.';
+      }
+    }
+    zBuf[k++] = ']';
+    zBuf[k++] = 0;
  }
-
-  *zCsr = '\0';
 }
+
+/* Temporary - this is useful in conjunction with prettyPrintMem whilst
+** debugging. 
+*/
+char zGdbBuf[100];
 #endif

 /*
@ -1264,37 +1395,13 @@ case OP_Variable: {
  Mem *pVar;
  assert( j>=0 && j<p->nVar );

-  /* If we need to translate between text encodings, do it now. If this is
-  ** required, then put the new string in p->apVar. This way, if the
-  ** variable is used again, even after the virtual machine is reset, the
-  ** conversion won't have to be done again.
-  **
-  ** FIX ME: This is where we need to support databases that use other than
-  ** UTF-8 on disk.
+  /* Ensure the variable string (if it is a string) is UTF-8 encoded and
+  ** nul terminated. Do the transformation on the variable before it 
+  ** is copied onto the stack, in case it is used again before this VDBE is
+  ** finalized.
  */
  pVar = &p->apVar[j];
-  if( pVar->flags&MEM_Str && !(pVar->flags&MEM_Utf8) ){
-    char *zUtf8;
-    assert( pVar->flags&(MEM_Utf16le|MEM_Utf16be) );
-    zUtf8 = sqlite3utf16to8(pVar->z, pVar->n);
-    if( !zUtf8 ){
-      goto no_mem;
-    }
-    Release(pVar);
-    pVar->z = zUtf8;
-    pVar->n = strlen(zUtf8)+1;
-    pVar->flags = MEM_Str|MEM_Dyn|MEM_Utf8|MEM_Term;
-  }
-
-  /* Ensure that the variable value is nul terminated. Again, do this in
-  ** place.
-  **
-  ** FIX ME: The rest of the vdbe will soon understand MEM_Term, making
-  ** this step unnecessary.
-  */
-  if( pVar->flags&MEM_Str ){
-    NulTermify(pVar);
-  }
+  SetEncoding(pVar, MEM_Utf8|MEM_Term);

  /* Copy the value in pVar to the top of the stack. If pVar is a string or
  ** a blob just store a pointer to the same memory, do not make a copy.
@ -1531,7 +1638,7 @@ case OP_Concat: {
  }
  pTos++;
  pTos->n = nByte;
-  pTos->flags = MEM_Str|MEM_Dyn|MEM_Utf8;
+  pTos->flags = MEM_Str|MEM_Dyn|MEM_Utf8|MEM_Term;
  pTos->z = zNew;
  break;
 }
@ -1693,6 +1800,9 @@ case OP_Function: {
  popStack(&pTos, n);
  pTos++;
  *pTos = ctx.s;
+  if( pTos->flags & MEM_Str ){
+    pTos->flags |= MEM_Term;
+  }
  if( pTos->flags & MEM_Short ){
    pTos->z = pTos->zShort;
  }
@ -2311,7 +2421,11 @@ case OP_Column: {
    }
    off += off2;
    
-    sqlite3VdbeSerialGet(&zRec[off], colType, pTos);
+    sqlite3VdbeSerialGet(&zRec[off], colType, pTos, p->db->enc);
+    rc = SetEncoding(pTos, MEM_Utf8|MEM_Term);
+    if( rc!=SQLITE_OK ){
+      goto abort_due_to_error;
+    }
    break;
  }

@ -2422,7 +2536,11 @@ case OP_Column: {
    getBtreeMem(pCrsr, offset, len, pC->keyAsData, &sMem);
    zData = sMem.z;
  }
-  sqlite3VdbeSerialGet(zData, pC->aType[p2], pTos);
+  sqlite3VdbeSerialGet(zData, pC->aType[p2], pTos, p->db->enc);
+  rc = SetEncoding(pTos, MEM_Utf8|MEM_Term);
+  if( rc!=SQLITE_OK ){
+    goto abort_due_to_error;
+  }

  Release(&sMem);
  break;
@ -2491,6 +2609,7 @@ case OP_MakeRecord: {
    if( zAffinity ){
      applyAffinity(pRec, zAffinity[pRec-pData0]);
    }
+    SetEncoding(pRec, encToFlags(p->db->enc));
    serial_type = sqlite3VdbeSerialType(pRec);
    nBytes += sqlite3VdbeSerialTypeLen(serial_type);
    nBytes += sqlite3VarintLen(serial_type);
@ -2614,6 +2733,7 @@ case OP_MakeIdxKey: {
    if( pRec->flags&MEM_Null ){
      containsNull = 1;
    }
+    SetEncoding(pRec, encToFlags(p->db->enc));
    serial_type = sqlite3VdbeSerialType(pRec);
    nByte += sqlite3VarintLen(serial_type);
    nByte += sqlite3VdbeSerialTypeLen(serial_type);
@ -2645,7 +2765,8 @@ case OP_MakeIdxKey: {
  
  /* Build the key in the buffer pointed to by zKey. */
  for(pRec=pData0; pRec<=pTos; pRec++){
-    offset += sqlite3PutVarint(&zKey[offset], sqlite3VdbeSerialType(pRec));
+    u64 serial_type = sqlite3VdbeSerialType(pRec);
+    offset += sqlite3PutVarint(&zKey[offset], serial_type);
    offset += sqlite3VdbeSerialPut(&zKey[offset], pRec);
  }
  if( addRowid ){
@ -2968,6 +3089,7 @@ case OP_OpenWrite: {
    pCur->pKeyInfo = (KeyInfo*)pOp->p3;
    if( pCur->pKeyInfo ){
      pCur->pIncrKey = &pCur->pKeyInfo->incrKey;
+      pCur->pKeyInfo->enc = p->db->enc;
    }else{
      pCur->pIncrKey = &pCur->bogusIncrKey;
    }
@ -3051,6 +3173,7 @@ case OP_OpenTemp: {
        rc = sqlite3BtreeCursor(pCx->pBt, pgno, 1, sqlite3VdbeKeyCompare,
            pOp->p3, &pCx->pCursor);
        pCx->pKeyInfo = (KeyInfo*)pOp->p3;
+        pCx->pKeyInfo->enc = p->db->enc;
        pCx->pIncrKey = &pCx->pKeyInfo->incrKey;
      }
    }else{
@ -3824,7 +3947,8 @@ case OP_IdxColumn: {
  }

  pTos++;
-  sqlite3VdbeSerialGet(&zData[len], serial_type, pTos);
+  sqlite3VdbeSerialGet(&zData[len], serial_type, pTos, p->db->enc);
+  SetEncoding(pTos, MEM_Utf8|MEM_Term);
  if( freeZData ){
    sqliteFree(zData);
  }
@ -4585,6 +4709,7 @@ case OP_SortPut: {
 case OP_Sort: {
  int i;
  KeyInfo *pKeyInfo = (KeyInfo*)pOp->p3;
+  pKeyInfo->enc = p->db->enc;
  Sorter *pElem;
  Sorter *apSorter[NSORT];
  for(i=0; i<NSORT; i++){
@ -5237,38 +5362,6 @@ default: {
          fprintf(p->trace, " i:%lld", pTos[i].i);
        }else if( pTos[i].flags & MEM_Real ){
          fprintf(p->trace, " r:%g", pTos[i].r);
-        }else if( pTos[i].flags & MEM_Str ){
-          int j, k;
-          char zBuf[100];
-          zBuf[0] = ' ';
-          if( pTos[i].flags & MEM_Dyn ){
-            zBuf[1] = 'z';
-            assert( (pTos[i].flags & (MEM_Static|MEM_Ephem))==0 );
-          }else if( pTos[i].flags & MEM_Static ){
-            zBuf[1] = 't';
-            assert( (pTos[i].flags & (MEM_Dyn|MEM_Ephem))==0 );
-          }else if( pTos[i].flags & MEM_Ephem ){
-            zBuf[1] = 'e';
-            assert( (pTos[i].flags & (MEM_Static|MEM_Dyn))==0 );
-          }else{
-            zBuf[1] = 's';
-          }
-          zBuf[2] = '[';
-          k = 3;
-          for(j=0; j<15 && j<pTos[i].n; j++){
-            u8 c = pTos[i].z[j];
-            if( c==0 && j==pTos[i].n-1 ) break;
-            zBuf[k++] = "0123456789ABCDEF"[c>>4];
-            zBuf[k++] = "0123456789ABCDEF"[c&0xf];
-            if( c>=0x20 && c<0x7f ){
-              zBuf[k++] = c;
-            }else{
-              zBuf[k++] = '.';
-            }
-          }
-          zBuf[k++] = ']';
-          zBuf[k++] = 0;
-          fprintf(p->trace, "%s", zBuf);
        }else{
          char zBuf[100];
          prettyPrintMem(pTos, zBuf, 100);
--- a/src/vdbeInt.h
+++ b/src/vdbeInt.h
@ -147,23 +147,24 @@ typedef struct Mem Mem;
 #define MEM_Int       0x0004   /* Value is an integer */
 #define MEM_Real      0x0008   /* Value is a real number */
 #define MEM_Blob      0x0010   /* Value is a BLOB */
+#define MEM_Struct    0x0020   /* Value is some kind of struct */

-#define MEM_Term      0x1000   /* String has a nul terminator character */
+#define MEM_Utf8      0x0040   /* String uses UTF-8 encoding */
+#define MEM_Utf16be   0x0080   /* String uses UTF-16 big-endian */
+#define MEM_Utf16le   0x0100   /* String uses UTF-16 little-endian */
+#define MEM_Term      0x0200   /* String has a nul terminator character */

-#define MEM_Utf8      0x0020   /* String uses UTF-8 encoding */
-#define MEM_Utf16be   0x0040   /* String uses UTF-16 big-endian */
-#define MEM_Utf16le   0x0080   /* String uses UTF-16 little-endian */
+#define MEM_Dyn       0x0400   /* Need to call sqliteFree() on Mem.z */
+#define MEM_Static    0x0800   /* Mem.z points to a static string */
+#define MEM_Ephem     0x1000   /* Mem.z points to an ephemeral string */
+#define MEM_Short     0x2000   /* Mem.z points to Mem.zShort */

-#define MEM_Dyn       0x0100   /* Need to call sqliteFree() on Mem.z */
-#define MEM_Static    0x0200   /* Mem.z points to a static string */
-#define MEM_Ephem     0x0400   /* Mem.z points to an ephemeral string */
-#define MEM_Short     0x0800   /* Mem.z points to Mem.zShort */

 /* The following MEM_ value appears only in AggElem.aMem.s.flag fields.
 ** It indicates that the corresponding AggElem.aMem.z points to a
 ** aggregate function context that needs to be finalized.
 */
-#define MEM_AggCtx    0x1000   /* Mem.z points to an agg function context */
+#define MEM_AggCtx    0x4000   /* Mem.z points to an agg function context */

 /*
 ** The "context" argument for a installable function.  A pointer to an
@ -329,9 +330,9 @@ int sqlite3VdbeCursorMoveto(Cursor*);
 void sqlite3VdbePrintOp(FILE*, int, Op*);
 #endif
 int sqlite3VdbeSerialTypeLen(u64);
-u64 sqlite3VdbeSerialType(const Mem *);
-int sqlite3VdbeSerialPut(unsigned char *, const Mem *);
-int sqlite3VdbeSerialGet(const unsigned char *, u64, Mem *);
+u64 sqlite3VdbeSerialType(Mem *);
+int sqlite3VdbeSerialPut(unsigned char *, Mem *);
+int sqlite3VdbeSerialGet(const unsigned char *, u64, Mem *, u8 enc);

 int sqlite2BtreeKeyCompare(BtCursor *, const void *, int, int, int *);
 int sqlite3VdbeIdxKeyCompare(Cursor*, int , const unsigned char*, int*);
@ -341,3 +342,5 @@ int sqlite3VdbeKeyCompare(void*,int,const void*,int, const void*);
 int sqlite3VdbeRowCompare(void*,int,const void*,int, const void*);
 int sqlite3VdbeExec(Vdbe*);
 int sqlite3VdbeList(Vdbe*);
+int sqlite3VdbeSetEncoding(Mem *, u8);
+
--- a/src/vdbeaux.c
+++ b/src/vdbeaux.c
@ -1135,7 +1135,7 @@ int sqlite3_bind_int64(sqlite3_stmt *p, int i, long long int iValue){
    pVar->flags = MEM_Int;
    pVar->i = iValue;
  }
-  return SQLITE_OK;
+  return rc;
 }

 /*
@ -1199,7 +1199,13 @@ int sqlite3_bind_text16(
  int nData, 
  int eCopy
 ){
-  int flags = MEM_Str|MEM_Utf16le|MEM_Utf16be;
+  int flags;
+  
+  if( SQLITE3_BIGENDIAN ){
+    flags = MEM_Str|MEM_Utf16be;
+  }else{
+    flags = MEM_Str|MEM_Utf16le;
+  }

  if( zData ){
    /* If nData is less than zero, measure the length of the string. 
@ -1362,7 +1368,7 @@ int sqlite3VdbeCursorMoveto(Cursor *p){
 /*
 ** Return the serial-type for the value stored in pMem.
 */
-u64 sqlite3VdbeSerialType(const Mem *pMem){
+u64 sqlite3VdbeSerialType(Mem *pMem){
  int flags = pMem->flags;

  if( flags&MEM_Null ){
@ -1380,12 +1386,13 @@ u64 sqlite3VdbeSerialType(const Mem *pMem){
    return 5;
  }
  if( flags&MEM_Str ){
-    /* We assume that the string is NULL-terminated. We don't store the
-    ** NULL-terminator - it is implied by the string storage class.
-    */
+    u64 t;
    assert( pMem->n>0 );
-    assert( pMem->z[pMem->n-1]=='\0' );
-    return (pMem->n*2 + 11); /* (pMem->n-1)*2 + 13 */
+    t = (pMem->n*2) + 13;
+    if( pMem->flags&MEM_Term ){
+      t -= ((pMem->flags&MEM_Utf8)?2:4);
+    }
+    return t;
  }
  if( flags&MEM_Blob ){
    return (pMem->n*2 + 12);
@ -1415,7 +1422,7 @@ int sqlite3VdbeSerialTypeLen(u64 serial_type){
 ** buf. It is assumed that the caller has allocated sufficient space.
 ** Return the number of bytes written.
 */ 
-int sqlite3VdbeSerialPut(unsigned char *buf, const Mem *pMem){
+int sqlite3VdbeSerialPut(unsigned char *buf, Mem *pMem){
  u64 serial_type = sqlite3VdbeSerialType(pMem);
  int len;

@ -1454,7 +1461,12 @@ int sqlite3VdbeSerialPut(unsigned char *buf, const Mem *pMem){
 ** Deserialize the data blob pointed to by buf as serial type serial_type
 ** and store the result in pMem.  Return the number of bytes read.
 */ 
-int sqlite3VdbeSerialGet(const unsigned char *buf, u64 serial_type, Mem *pMem){
+int sqlite3VdbeSerialGet(
+  const unsigned char *buf, 
+  u64 serial_type, 
+  Mem *pMem,
+  u8 enc
+){
  int len;

  assert( serial_type!=0 );
@ -1486,7 +1498,7 @@ int sqlite3VdbeSerialGet(const unsigned char *buf, u64 serial_type, Mem *pMem){
      pMem->r = *(double*)&v;
    }else{
      pMem->flags = MEM_Int;
-      pMem->i = *(int*)&v;
+      pMem->i = *(i64*)&v;
    }
    return len;
  }
@ -1495,8 +1507,19 @@ int sqlite3VdbeSerialGet(const unsigned char *buf, u64 serial_type, Mem *pMem){
  assert( serial_type>=12 );
  len = sqlite3VdbeSerialTypeLen(serial_type);
  if( serial_type&0x01 ){
-    pMem->flags = MEM_Str|MEM_Utf8;
-    pMem->n = len+1;
+    switch( enc ){
+      case TEXT_Utf8:
+        pMem->flags = MEM_Str|MEM_Utf8|MEM_Term;
+        break;
+      case TEXT_Utf16le:
+        pMem->flags = MEM_Str|MEM_Utf16le|MEM_Term;
+        break;
+      case TEXT_Utf16be:
+        pMem->flags = MEM_Str|MEM_Utf16be|MEM_Term;
+        break;
+      assert(0);
+    }
+    pMem->n = len+(enc==TEXT_Utf8?1:2);
  }else{
    pMem->flags = MEM_Blob;
    pMem->n = len;
@ -1516,6 +1539,9 @@ int sqlite3VdbeSerialGet(const unsigned char *buf, u64 serial_type, Mem *pMem){
  memcpy(pMem->z, buf, len); 
  if( pMem->flags&MEM_Str ){
    pMem->z[len] = '\0';
+    if( enc!=TEXT_Utf8 ){
+      pMem->z[len+1] = '\0';
+    }
  }

  return len;
@ -1635,6 +1661,7 @@ int sqlite3VdbeKeyCompare(
  int offset2 = 0;
  int i = 0;
  int rc = 0;
+  u8 enc = pKeyInfo->enc;
  const unsigned char *aKey1 = (const unsigned char *)pKey1;
  const unsigned char *aKey2 = (const unsigned char *)pKey2;
  
@ -1675,8 +1702,8 @@ int sqlite3VdbeKeyCompare(
    ** the file is corrupted.  Then read the value from each key into mem1
    ** and mem2 respectively.
    */
-    offset1 += sqlite3VdbeSerialGet(&aKey1[offset1], serial_type1, &mem1);
-    offset2 += sqlite3VdbeSerialGet(&aKey2[offset2], serial_type2, &mem2);
+    offset1 += sqlite3VdbeSerialGet(&aKey1[offset1], serial_type1, &mem1, enc);
+    offset2 += sqlite3VdbeSerialGet(&aKey2[offset2], serial_type2, &mem2, enc);

    rc = sqlite3MemCompare(&mem1, &mem2, pKeyInfo->aColl[i]);
    if( mem1.flags&MEM_Dyn ){
@ -1734,6 +1761,7 @@ int sqlite3VdbeRowCompare(
  int toffset1 = 0;
  int toffset2 = 0;
  int i;
+  u8 enc = pKeyInfo->enc;
  const unsigned char *aKey1 = (const unsigned char *)pKey1;
  const unsigned char *aKey2 = (const unsigned char *)pKey2;

@ -1764,8 +1792,8 @@ int sqlite3VdbeRowCompare(
    ** the file is corrupted.  Then read the value from each key into mem1
    ** and mem2 respectively.
    */
-    offset1 += sqlite3VdbeSerialGet(&aKey1[offset1], serial_type1, &mem1);
-    offset2 += sqlite3VdbeSerialGet(&aKey2[offset2], serial_type2, &mem2);
+    offset1 += sqlite3VdbeSerialGet(&aKey1[offset1], serial_type1, &mem1, enc);
+    offset2 += sqlite3VdbeSerialGet(&aKey2[offset2], serial_type2, &mem2, enc);

    rc = sqlite3MemCompare(&mem1, &mem2, pKeyInfo->aColl[i]);
    if( mem1.flags&MEM_Dyn ){