Merge the sorter-coalesce-writes branch into the trunk. This improves CREATE INDEX performance on some platforms.

FossilOrigin-Name: e1e9cb08b011e67b767091e42225f22ec862fa64
This commit is contained in:
dan 2012-08-06 19:28:20 +00:00
commit 09ac7ec544
12 changed files with 420 additions and 142 deletions

View File

@ -1,5 +1,5 @@
C Update\sdescription\sstrings\sin\sthe\sVSIX\spackage.
D 2012-08-06T10:51:55.331
C Merge\sthe\ssorter-coalesce-writes\sbranch\sinto\sthe\strunk.\sThis\simproves\sCREATE\sINDEX\sperformance\son\ssome\splatforms.
D 2012-08-06T19:28:20.956
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
F Makefile.in abd5c10d21d1395f140d9e50ea999df8fa4d6376
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@ -117,16 +117,16 @@ F sqlite.pc.in 42b7bf0d02e08b9e77734a47798d1a55a9e0716b
F sqlite3.1 6be1ad09113570e1fc8dcaff84c9b0b337db5ffc
F sqlite3.pc.in ae6f59a76e862f5c561eb32a380228a02afc3cad
F src/alter.c 149cc80d9257971b0bff34e58fb2263e01998289
F src/analyze.c a4790912e504c8ddac273445e7aba39bbce50881
F src/analyze.c 7553068d21e32a57fc33ab6b2393fc8c1ba41410
F src/attach.c 577bf5675b0c50495fc28549f2fcbdb1bac71143
F src/auth.c 523da7fb4979469955d822ff9298352d6b31de34
F src/backup.c 5b31b24d6814b11de763debf342c8cd0a15a4910
F src/bitvec.c 26675fe8e431dc555e6f2d0e11e651d172234aa1
F src/btmutex.c 976f45a12e37293e32cae0281b15a21d48a8aaa7
F src/btree.c f0b71054103cb77eb5e782088c16998ec4f06624
F src/btree.h 48a013f8964f12d944d90e4700df47b72dd6d923
F src/btreeInt.h 38a639c0542c29fe8331a221c4aed0cb8686249e
F src/build.c 47c4506afe4bcb4ed1f4b5357582d1cb3402f8ad
F src/btree.c 1d366468b6f30234d76bf1da43e038d6f3ba2c9c
F src/btree.h 4aee02e879211bfcfd3f551769578d2e940ab6c2
F src/btreeInt.h 4e5c2bd0f9b36b2a815a6d84f771a61a65830621
F src/build.c 0f6b40ad6211dcaba6159d0f9a297f0704f22142
F src/callback.c 0cb4228cdcd827dcc5def98fb099edcc9142dbcd
F src/complete.c dc1d136c0feee03c2f7550bafc0d29075e36deac
F src/ctime.c 500d019da966631ad957c37705642be87524463b
@ -180,7 +180,7 @@ F src/select.c a365da6d7a6d7d8a10ad60ca71837ab5e9369466
F src/shell.c 076e1c90d594644f36027c8ecff9a392cf2d3a06
F src/sqlite.h.in 3e8035bc406b1571a5cc8ea46bcc831201676f1a
F src/sqlite3ext.h 6904f4aadf976f95241311fbffb00823075d9477
F src/sqliteInt.h ed41801550b0b8fb8217fcfd2e362118062b30c0
F src/sqliteInt.h c8169801f8bbfdf5873cc6fa45cb5df720c04db4
F src/sqliteLimit.h 164b0e6749d31e0daa1a4589a169d31c0dec7b3d
F src/status.c 35939e7e03abf1b7577ce311f48f682c40de3208
F src/table.c 2cd62736f845d82200acfa1287e33feb3c15d62e
@ -227,7 +227,7 @@ F src/test_superlock.c 2b97936ca127d13962c3605dbc9a4ef269c424cd
F src/test_syscall.c a992d8c80ea91fbf21fb2dd570db40e77dd7e6ae
F src/test_tclvar.c f4dc67d5f780707210d6bb0eb6016a431c04c7fa
F src/test_thread.c e286f2173563f2a1747c24bcda6b9d030bf4f4e4
F src/test_vfs.c da6d0d982b11756c94c1760196355d33d03ff745
F src/test_vfs.c c6260ef238c1142c8f8bd402db02216afd182ae3
F src/test_vfstrace.c 6b28adb2a0e8ecd0f2e3581482e1f658b11b4067
F src/test_wholenumber.c 3d2b9ed1505c40ad5c5ca2ad16ae7a289d6cc251
F src/test_wsd.c 41cadfd9d97fe8e3e4e44f61a4a8ccd6f7ca8fe9
@ -237,14 +237,14 @@ F src/update.c d3076782c887c10e882996550345da9c4c9f9dea
F src/utf.c 890c67dcfcc7a74623c95baac7535aadfe265e84
F src/util.c 0af2e515dc0dabacec931bca39525f6c3f1c5455
F src/vacuum.c 587a52bb8833d7ac15af8916f25437e2575028bd
F src/vdbe.c f5ad3c06dc3fe647097065829c013f3f1b9eadca
F src/vdbe.c 75da79cdcd58481825a06f045bc2f5ea3966eeae
F src/vdbe.h 18f581cac1f4339ec3299f3e0cc6e11aec654cdb
F src/vdbeInt.h 986b6b11a13c517337355009e5438703ba5b0a40
F src/vdbeapi.c 88ea823bbcb4320f5a6607f39cd7c2d3cc4c26b1
F src/vdbeaux.c dce80038c3c41f2680e5ab4dd0f7e0d8b7ff9071
F src/vdbeblob.c 32f2a4899d67f69634ea4dd93e3f651936d732cb
F src/vdbemem.c cb55e84b8e2c15704968ee05f0fae25883299b74
F src/vdbesort.c 628b2bc0cc82cae0e9946f70c5c81986e9fba91f
F src/vdbesort.c 4897215f0a0c4e731aa5ac5fc0317b62a4919e79
F src/vdbetrace.c 8bd5da325fc90f28464335e4cc4ad1407fe30835
F src/vtab.c bb8ea3a26608bb1357538a5d2fc72beba6638998
F src/wal.c 9294df6f96aae5909ae1a9b733fd1e1b4736978b
@ -531,6 +531,7 @@ F test/index.test b5429732b3b983fa810e3ac867d7ca85dae35097
F test/index2.test ee83c6b5e3173a3d7137140d945d9a5d4fdfb9d6
F test/index3.test 423a25c789fc8cc51aaf2a4370bbdde2d9e9eed7
F test/index4.test 2983216eb8c86ee62d9ed7cb206b5cc3331c0026
F test/index5.test edc8c64ca78bee140c21ce3836820fadf47906bb
F test/indexedby.test be501e381b82b2f8ab406309ba7aac46e221f4ad
F test/indexfault.test 31d4ab9a7d2f6e9616933eb079722362a883eb1d
F test/init.test 15c823093fdabbf7b531fe22cf037134d09587a7
@ -1008,7 +1009,7 @@ F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381
F tool/win/sqlite.vsix 67d8a99aceb56384a81b3f30d6c71743146d2cc9
P 335e91e599555d9f4e42f90576d1676c381314f4
R eadc6c87ec15f9db6747eb42f404a18d
U mistachkin
Z 9bc32025e8412eea1e17719cbb9188be
P 541e9310a7b88e0b40c6530947803527f28e51de 2e5741f774248abc678b50711c43e38ca30c9091
R 9ea05a2580296dc700acf20c4ae23876
U dan
Z ec36f3ad854c62e55cc3fc27576cdc7b

View File

@ -1 +1 @@
541e9310a7b88e0b40c6530947803527f28e51de
e1e9cb08b011e67b767091e42225f22ec862fa64

View File

@ -176,7 +176,7 @@ static void openStatTable(
"CREATE TABLE %Q.%s(%s)", pDb->zName, zTab, aTable[i].zCols
);
aRoot[i] = pParse->regRoot;
aCreateTbl[i] = 1;
aCreateTbl[i] = OPFLAG_P2ISREG;
}else{
/* The table already exists. If zWhere is not NULL, delete all entries
** associated with the table zWhere. If zWhere is NULL, delete the

View File

@ -5926,7 +5926,8 @@ static int balance_nonroot(
MemPage *pParent, /* Parent page of siblings being balanced */
int iParentIdx, /* Index of "the page" in pParent */
u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */
int isRoot /* True if pParent is a root-page */
int isRoot, /* True if pParent is a root-page */
int bBulk /* True if this call is part of a bulk load */
){
BtShared *pBt; /* The whole database */
int nCell = 0; /* Number of cells in apCell[] */
@ -6257,7 +6258,7 @@ static int balance_nonroot(
if( rc ) goto balance_cleanup;
}else{
assert( i>0 );
rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0);
rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
if( rc ) goto balance_cleanup;
apNew[i] = pNew;
nNew++;
@ -6707,7 +6708,7 @@ static int balance(BtCursor *pCur){
** pSpace buffer passed to the latter call to balance_nonroot().
*/
u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1);
rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, pCur->hints);
if( pFree ){
/* If pFree is not NULL, it points to the pSpace buffer used
** by a previous call to balance_nonroot(). Its contents are
@ -8294,3 +8295,13 @@ int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
pBt->btsFlags &= ~BTS_NO_WAL;
return rc;
}
/*
** set the mask of hint flags for cursor pCsr. Currently the only valid
** values are 0 and BTREE_BULKLOAD.
*/
void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){
assert( mask==BTREE_BULKLOAD || mask==0 );
pCsr->hints = mask;
}

View File

@ -135,6 +135,12 @@ int sqlite3BtreeUpdateMeta(Btree*, int idx, u32 value);
#define BTREE_USER_VERSION 6
#define BTREE_INCR_VACUUM 7
/*
** Values that may be OR'd together to form the second argument of an
** sqlite3BtreeCursorHints() call.
*/
#define BTREE_BULKLOAD 0x00000001
int sqlite3BtreeCursor(
Btree*, /* BTree containing table to open */
int iTable, /* Index of root page */
@ -178,8 +184,8 @@ struct Pager *sqlite3BtreePager(Btree*);
int sqlite3BtreePutData(BtCursor*, u32 offset, u32 amt, void*);
void sqlite3BtreeCacheOverflow(BtCursor *);
void sqlite3BtreeClearCursor(BtCursor *);
int sqlite3BtreeSetVersion(Btree *pBt, int iVersion);
void sqlite3BtreeCursorHints(BtCursor *, unsigned int mask);
#ifndef NDEBUG
int sqlite3BtreeCursorIsValid(BtCursor*);

View File

@ -510,6 +510,7 @@ struct BtCursor {
#ifndef SQLITE_OMIT_INCRBLOB
u8 isIncrblobHandle; /* True if this cursor is an incr. io handle */
#endif
u8 hints; /* As configured by CursorSetHints() */
i16 iPage; /* Index of current page in apPage */
u16 aiIdx[BTCURSOR_MAX_DEPTH]; /* Current index in apPage[i] */
MemPage *apPage[BTCURSOR_MAX_DEPTH]; /* Pages from root to current page */

View File

@ -1581,7 +1581,7 @@ void sqlite3EndTable(
assert(pParse->nTab==1);
sqlite3VdbeAddOp3(v, OP_OpenWrite, 1, pParse->regRoot, iDb);
sqlite3VdbeChangeP5(v, 1);
sqlite3VdbeChangeP5(v, OPFLAG_P2ISREG);
pParse->nTab = 2;
sqlite3SelectDestInit(&dest, SRT_Table, 1);
sqlite3Select(pParse, pSelect, &dest);
@ -2397,9 +2397,7 @@ static void sqlite3RefillIndex(Parse *pParse, Index *pIndex, int memRootPage){
pKey = sqlite3IndexKeyinfo(pParse, pIndex);
sqlite3VdbeAddOp4(v, OP_OpenWrite, iIdx, tnum, iDb,
(char *)pKey, P4_KEYINFO_HANDOFF);
if( memRootPage>=0 ){
sqlite3VdbeChangeP5(v, 1);
}
sqlite3VdbeChangeP5(v, OPFLAG_BULKCSR|((memRootPage>=0)?OPFLAG_P2ISREG:0));
#ifndef SQLITE_OMIT_MERGE_SORT
/* Open the sorter cursor if we are to use one. */

View File

@ -2317,6 +2317,8 @@ struct AuthContext {
#define OPFLAG_CLEARCACHE 0x20 /* Clear pseudo-table cache in OP_Column */
#define OPFLAG_LENGTHARG 0x40 /* OP_Column only used for length() */
#define OPFLAG_TYPEOFARG 0x80 /* OP_Column only used for typeof() */
#define OPFLAG_BULKCSR 0x01 /* OP_Open** used to open bulk cursor */
#define OPFLAG_P2ISREG 0x02 /* P2 to OP_Open** is a register number */
/*
* Each trigger present in the database schema is stored as an instance of

View File

@ -361,7 +361,8 @@ static int tvfsWrite(
if( p->pScript && p->mask&TESTVFS_WRITE_MASK ){
tvfsExecTcl(p, "xWrite",
Tcl_NewStringObj(pFd->zFilename, -1), pFd->pShmId, 0
Tcl_NewStringObj(pFd->zFilename, -1), pFd->pShmId,
Tcl_NewWideIntObj(iOfst)
);
tvfsResultCode(p, &rc);
}

View File

@ -3120,6 +3120,9 @@ case OP_OpenWrite: {
VdbeCursor *pCur;
Db *pDb;
assert( (pOp->p5&(OPFLAG_P2ISREG|OPFLAG_BULKCSR))==pOp->p5 );
assert( pOp->opcode==OP_OpenWrite || pOp->p5==0 );
if( p->expired ){
rc = SQLITE_ABORT;
break;
@ -3143,7 +3146,7 @@ case OP_OpenWrite: {
}else{
wrFlag = 0;
}
if( pOp->p5 ){
if( pOp->p5 & OPFLAG_P2ISREG ){
assert( p2>0 );
assert( p2<=p->nMem );
pIn2 = &aMem[p2];
@ -3174,6 +3177,8 @@ case OP_OpenWrite: {
pCur->isOrdered = 1;
rc = sqlite3BtreeCursor(pX, p2, wrFlag, pKeyInfo, pCur->pCursor);
pCur->pKeyInfo = pKeyInfo;
assert( OPFLAG_BULKCSR==BTREE_BULKLOAD );
sqlite3BtreeCursorHints(pCur->pCursor, (pOp->p5 & OPFLAG_BULKCSR));
/* Since it performs no memory allocation or IO, the only value that
** sqlite3BtreeCursor() may return is SQLITE_OK. */

View File

@ -22,6 +22,7 @@
typedef struct VdbeSorterIter VdbeSorterIter;
typedef struct SorterRecord SorterRecord;
typedef struct FileWriter FileWriter;
/*
** NOTES ON DATA STRUCTURE USED FOR N-WAY MERGES:
@ -119,6 +120,22 @@ struct VdbeSorterIter {
sqlite3_file *pFile; /* File iterator is reading from */
u8 *aAlloc; /* Allocated space */
u8 *aKey; /* Pointer to current key */
u8 *aBuffer; /* Current read buffer */
int nBuffer; /* Size of read buffer in bytes */
};
/*
** An instance of this structure is used to separate the stream of records
** being written to files by the merge-sort code into aligned, page-sized
** blocks.
*/
struct FileWriter {
u8 *aBuffer; /* Pointer to write buffer */
int nBuffer; /* Size of write buffer in bytes */
int iBufStart; /* First byte of buffer to write */
int iBufEnd; /* Last byte of buffer to write */
i64 iWriteOff; /* Offset of start of buffer in file */
sqlite3_file *pFile; /* File to write to */
};
/*
@ -144,9 +161,125 @@ struct SorterRecord {
*/
static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
sqlite3DbFree(db, pIter->aAlloc);
sqlite3DbFree(db, pIter->aBuffer);
memset(pIter, 0, sizeof(VdbeSorterIter));
}
/*
** Read nByte bytes of data from the stream of data iterated by object p.
** If successful, set *ppOut to point to a buffer containing the data
** and return SQLITE_OK. Otherwise, if an error occurs, return an SQLite
** error code.
**
** The buffer indicated by *ppOut may only be considered valid until the
** next call to this function.
*/
static int vdbeSorterIterRead(
sqlite3 *db, /* Database handle (for malloc) */
VdbeSorterIter *p, /* Iterator */
int nByte, /* Bytes of data to read */
u8 **ppOut /* OUT: Pointer to buffer containing data */
){
int iBuf; /* Offset within buffer to read from */
int nAvail; /* Bytes of data available in buffer */
assert( p->aBuffer );
/* If there is no more data to be read from the buffer, read the next
** p->nBuffer bytes of data from the file into it. Or, if there are less
** than p->nBuffer bytes remaining in the PMA, read all remaining data. */
iBuf = p->iReadOff % p->nBuffer;
if( iBuf==0 ){
int nRead; /* Bytes to read from disk */
int rc; /* sqlite3OsRead() return code */
/* Determine how many bytes of data to read. */
nRead = p->iEof - p->iReadOff;
if( nRead>p->nBuffer ) nRead = p->nBuffer;
assert( nRead>0 );
/* Read data from the file. Return early if an error occurs. */
rc = sqlite3OsRead(p->pFile, p->aBuffer, nRead, p->iReadOff);
assert( rc!=SQLITE_IOERR_SHORT_READ );
if( rc!=SQLITE_OK ) return rc;
}
nAvail = p->nBuffer - iBuf;
if( nByte<=nAvail ){
/* The requested data is available in the in-memory buffer. In this
** case there is no need to make a copy of the data, just return a
** pointer into the buffer to the caller. */
*ppOut = &p->aBuffer[iBuf];
p->iReadOff += nByte;
}else{
/* The requested data is not all available in the in-memory buffer.
** In this case, allocate space at p->aAlloc[] to copy the requested
** range into. Then return a copy of pointer p->aAlloc to the caller. */
int nRem; /* Bytes remaining to copy */
/* Extend the p->aAlloc[] allocation if required. */
if( p->nAlloc<nByte ){
int nNew = p->nAlloc*2;
while( nByte>nNew ) nNew = nNew*2;
p->aAlloc = sqlite3DbReallocOrFree(db, p->aAlloc, nNew);
if( !p->aAlloc ) return SQLITE_NOMEM;
p->nAlloc = nNew;
}
/* Copy as much data as is available in the buffer into the start of
** p->aAlloc[]. */
memcpy(p->aAlloc, &p->aBuffer[iBuf], nAvail);
p->iReadOff += nAvail;
nRem = nByte - nAvail;
/* The following loop copies up to p->nBuffer bytes per iteration into
** the p->aAlloc[] buffer. */
while( nRem>0 ){
int rc; /* vdbeSorterIterRead() return code */
int nCopy; /* Number of bytes to copy */
u8 *aNext; /* Pointer to buffer to copy data from */
nCopy = nRem;
if( nRem>p->nBuffer ) nCopy = p->nBuffer;
rc = vdbeSorterIterRead(db, p, nCopy, &aNext);
if( rc!=SQLITE_OK ) return rc;
assert( aNext!=p->aAlloc );
memcpy(&p->aAlloc[nByte - nRem], aNext, nCopy);
nRem -= nCopy;
}
*ppOut = p->aAlloc;
}
return SQLITE_OK;
}
/*
** Read a varint from the stream of data accessed by p. Set *pnOut to
** the value read.
*/
static int vdbeSorterIterVarint(sqlite3 *db, VdbeSorterIter *p, u64 *pnOut){
int iBuf;
iBuf = p->iReadOff % p->nBuffer;
if( iBuf && (p->nBuffer-iBuf)>=9 ){
p->iReadOff += sqlite3GetVarint(&p->aBuffer[iBuf], pnOut);
}else{
u8 aVarint[9];
int i;
for(i=0; i<sizeof(aVarint); i++){
u8 *a;
int rc = vdbeSorterIterRead(db, p, 1, &a);
if( rc ) return rc;
aVarint[i] = *a;
if( (aVarint[i] & 0x80)==0 ) break;
}
sqlite3GetVarint(aVarint, pnOut);
}
return SQLITE_OK;
}
/*
** Advance iterator pIter to the next key in its PMA. Return SQLITE_OK if
** no error occurs, or an SQLite error code if one does.
@ -156,96 +289,18 @@ static int vdbeSorterIterNext(
VdbeSorterIter *pIter /* Iterator to advance */
){
int rc; /* Return Code */
int nRead; /* Number of bytes read */
int nRec = 0; /* Size of record in bytes */
int iOff = 0; /* Size of serialized size varint in bytes */
u64 nRec = 0; /* Size of record in bytes */
assert( pIter->iEof>=pIter->iReadOff );
if( pIter->iEof-pIter->iReadOff>5 ){
nRead = 5;
}else{
nRead = (int)(pIter->iEof - pIter->iReadOff);
}
if( nRead<=0 ){
if( pIter->iReadOff>=pIter->iEof ){
/* This is an EOF condition */
vdbeSorterIterZero(db, pIter);
return SQLITE_OK;
}
rc = sqlite3OsRead(pIter->pFile, pIter->aAlloc, nRead, pIter->iReadOff);
rc = vdbeSorterIterVarint(db, pIter, &nRec);
if( rc==SQLITE_OK ){
iOff = getVarint32(pIter->aAlloc, nRec);
if( (iOff+nRec)>nRead ){
int nRead2; /* Number of extra bytes to read */
if( (iOff+nRec)>pIter->nAlloc ){
int nNew = pIter->nAlloc*2;
while( (iOff+nRec)>nNew ) nNew = nNew*2;
pIter->aAlloc = sqlite3DbReallocOrFree(db, pIter->aAlloc, nNew);
if( !pIter->aAlloc ) return SQLITE_NOMEM;
pIter->nAlloc = nNew;
}
nRead2 = iOff + nRec - nRead;
rc = sqlite3OsRead(
pIter->pFile, &pIter->aAlloc[nRead], nRead2, pIter->iReadOff+nRead
);
}
}
assert( rc!=SQLITE_OK || nRec>0 );
pIter->iReadOff += iOff+nRec;
pIter->nKey = nRec;
pIter->aKey = &pIter->aAlloc[iOff];
return rc;
}
/*
** Write a single varint, value iVal, to file-descriptor pFile. Return
** SQLITE_OK if successful, or an SQLite error code if some error occurs.
**
** The value of *piOffset when this function is called is used as the byte
** offset in file pFile to write to. Before returning, *piOffset is
** incremented by the number of bytes written.
*/
static int vdbeSorterWriteVarint(
sqlite3_file *pFile, /* File to write to */
i64 iVal, /* Value to write as a varint */
i64 *piOffset /* IN/OUT: Write offset in file pFile */
){
u8 aVarint[9]; /* Buffer large enough for a varint */
int nVarint; /* Number of used bytes in varint */
int rc; /* Result of write() call */
nVarint = sqlite3PutVarint(aVarint, iVal);
rc = sqlite3OsWrite(pFile, aVarint, nVarint, *piOffset);
*piOffset += nVarint;
return rc;
}
/*
** Read a single varint from file-descriptor pFile. Return SQLITE_OK if
** successful, or an SQLite error code if some error occurs.
**
** The value of *piOffset when this function is called is used as the
** byte offset in file pFile from whence to read the varint. If successful
** (i.e. if no IO error occurs), then *piOffset is set to the offset of
** the first byte past the end of the varint before returning. *piVal is
** set to the integer value read. If an error occurs, the final values of
** both *piOffset and *piVal are undefined.
*/
static int vdbeSorterReadVarint(
sqlite3_file *pFile, /* File to read from */
i64 *piOffset, /* IN/OUT: Read offset in pFile */
i64 *piVal /* OUT: Value read from file */
){
u8 aVarint[9]; /* Buffer large enough for a varint */
i64 iOff = *piOffset; /* Offset in file to read from */
int rc; /* Return code */
rc = sqlite3OsRead(pFile, aVarint, 9, iOff);
if( rc==SQLITE_OK ){
*piOffset += getVarint(aVarint, (u64 *)piVal);
pIter->nKey = (int)nRec;
rc = vdbeSorterIterRead(db, pIter, nRec, &pIter->aKey);
}
return rc;
@ -264,22 +319,47 @@ static int vdbeSorterIterInit(
VdbeSorterIter *pIter, /* Iterator to populate */
i64 *pnByte /* IN/OUT: Increment this value by PMA size */
){
int rc;
int rc = SQLITE_OK;
int nBuf;
nBuf = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
assert( pSorter->iWriteOff>iStart );
assert( pIter->aAlloc==0 );
assert( pIter->aBuffer==0 );
pIter->pFile = pSorter->pTemp1;
pIter->iReadOff = iStart;
pIter->nAlloc = 128;
pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);
if( !pIter->aAlloc ){
pIter->nBuffer = nBuf;
pIter->aBuffer = (u8 *)sqlite3DbMallocRaw(db, nBuf);
if( !pIter->aBuffer ){
rc = SQLITE_NOMEM;
}else{
i64 nByte; /* Total size of PMA in bytes */
rc = vdbeSorterReadVarint(pSorter->pTemp1, &pIter->iReadOff, &nByte);
*pnByte += nByte;
pIter->iEof = pIter->iReadOff + nByte;
int iBuf;
iBuf = iStart % nBuf;
if( iBuf ){
int nRead = nBuf - iBuf;
if( (iStart + nRead) > pSorter->iWriteOff ){
nRead = pSorter->iWriteOff - iStart;
}
rc = sqlite3OsRead(
pSorter->pTemp1, &pIter->aBuffer[iBuf], nRead, iStart
);
assert( rc!=SQLITE_IOERR_SHORT_READ );
}
if( rc==SQLITE_OK ){
u64 nByte; /* Size of PMA in bytes */
pIter->iEof = pSorter->iWriteOff;
rc = vdbeSorterIterVarint(db, pIter, &nByte);
pIter->iEof = pIter->iReadOff + nByte;
*pnByte += nByte;
}
}
if( rc==SQLITE_OK ){
rc = vdbeSorterIterNext(db, pIter);
}
@ -531,6 +611,92 @@ static int vdbeSorterSort(const VdbeCursor *pCsr){
return SQLITE_OK;
}
/*
** Initialize a file-writer object.
*/
static int fileWriterInit(
sqlite3 *db, /* Database (for malloc) */
sqlite3_file *pFile, /* File to write to */
FileWriter *p, /* Object to populate */
i64 iStart /* Offset of pFile to begin writing at */
){
int nBuf = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
memset(p, 0, sizeof(FileWriter));
p->aBuffer = (u8 *)sqlite3DbMallocRaw(db, nBuf);
if( !p->aBuffer ) return SQLITE_NOMEM;
p->iBufEnd = p->iBufStart = (iStart % nBuf);
p->iWriteOff = iStart - p->iBufStart;
p->nBuffer = nBuf;
p->pFile = pFile;
return SQLITE_OK;
}
/*
** Write nData bytes of data to the file-write object. Return SQLITE_OK
** if successful, or an SQLite error code if an error occurs.
*/
static int fileWriterWrite(FileWriter *p, u8 *pData, int nData){
int nRem = nData;
while( nRem>0 ){
int nCopy = nRem;
if( nCopy>(p->nBuffer - p->iBufEnd) ){
nCopy = p->nBuffer - p->iBufEnd;
}
memcpy(&p->aBuffer[p->iBufEnd], &pData[nData-nRem], nCopy);
p->iBufEnd += nCopy;
if( p->iBufEnd==p->nBuffer ){
int rc = sqlite3OsWrite(p->pFile,
&p->aBuffer[p->iBufStart], p->iBufEnd - p->iBufStart,
p->iWriteOff + p->iBufStart
);
if( rc!=SQLITE_OK ) return rc;
p->iBufStart = p->iBufEnd = 0;
p->iWriteOff += p->nBuffer;
}
assert( p->iBufEnd<p->nBuffer );
nRem -= nCopy;
}
return SQLITE_OK;
}
/*
** Flush any buffered data to disk and clean up the file-writer object.
** The results of using the file-writer after this call are undefined.
** Return SQLITE_OK if flushing the buffered data succeeds or is not
** required. Otherwise, return an SQLite error code.
**
** Before returning, set *piEof to the offset immediately following the
** last byte written to the file.
*/
static int fileWriterFinish(sqlite3 *db, FileWriter *p, i64 *piEof){
int rc = SQLITE_OK;
if( p->aBuffer && p->iBufEnd>p->iBufStart ){
rc = sqlite3OsWrite(p->pFile,
&p->aBuffer[p->iBufStart], p->iBufEnd - p->iBufStart,
p->iWriteOff + p->iBufStart
);
}
*piEof = (p->iWriteOff + p->iBufEnd);
sqlite3DbFree(db, p->aBuffer);
memset(p, 0, sizeof(FileWriter));
return rc;
}
/*
** Write value iVal encoded as a varint to the file-write object. Return
** SQLITE_OK if successful, or an SQLite error code if an error occurs.
*/
static int fileWriterWriteVarint(FileWriter *p, u64 iVal){
int nByte;
u8 aByte[10];
nByte = sqlite3PutVarint(aByte, iVal);
return fileWriterWrite(p, aByte, nByte);
}
/*
** Write the current contents of the in-memory linked-list to a PMA. Return
@ -547,7 +713,11 @@ static int vdbeSorterSort(const VdbeCursor *pCsr){
*/
static int vdbeSorterListToPMA(sqlite3 *db, const VdbeCursor *pCsr){
int rc = SQLITE_OK; /* Return code */
int rc2; /* fileWriterFinish return code */
VdbeSorter *pSorter = pCsr->pSorter;
FileWriter writer;
memset(&writer, 0, sizeof(FileWriter));
if( pSorter->nInMemory==0 ){
assert( pSorter->pRecord==0 );
@ -565,41 +735,32 @@ static int vdbeSorterListToPMA(sqlite3 *db, const VdbeCursor *pCsr){
}
if( rc==SQLITE_OK ){
i64 iOff = pSorter->iWriteOff;
rc = fileWriterInit(db, pSorter->pTemp1, &writer, pSorter->iWriteOff);
}
if( rc==SQLITE_OK ){
SorterRecord *p;
SorterRecord *pNext = 0;
static const char eightZeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
pSorter->nPMA++;
rc = vdbeSorterWriteVarint(pSorter->pTemp1, pSorter->nInMemory, &iOff);
rc = fileWriterWriteVarint(&writer, pSorter->nInMemory);
for(p=pSorter->pRecord; rc==SQLITE_OK && p; p=pNext){
pNext = p->pNext;
rc = vdbeSorterWriteVarint(pSorter->pTemp1, p->nVal, &iOff);
rc = fileWriterWriteVarint(&writer, p->nVal);
if( rc==SQLITE_OK ){
rc = sqlite3OsWrite(pSorter->pTemp1, p->pVal, p->nVal, iOff);
iOff += p->nVal;
rc = fileWriterWrite(&writer, p->pVal, p->nVal);
}
sqlite3DbFree(db, p);
}
/* This assert verifies that unless an error has occurred, the size of
** the PMA on disk is the same as the expected size stored in
** pSorter->nInMemory. */
assert( rc!=SQLITE_OK || pSorter->nInMemory==(
iOff-pSorter->iWriteOff-sqlite3VarintLen(pSorter->nInMemory)
));
pSorter->iWriteOff = iOff;
if( rc==SQLITE_OK ){
/* Terminate each file with 8 extra bytes so that from any offset
** in the file we can always read 9 bytes without a SHORT_READ error */
rc = sqlite3OsWrite(pSorter->pTemp1, eightZeros, 8, iOff);
}
pSorter->pRecord = p;
}
rc2 = fileWriterFinish(db, &writer, &pSorter->iWriteOff);
if( rc==SQLITE_OK ) rc = rc2;
return rc;
}
@ -642,8 +803,14 @@ int sqlite3VdbeSorterWrite(
(pSorter->nInMemory>pSorter->mxPmaSize)
|| (pSorter->nInMemory>pSorter->mnPmaSize && sqlite3HeapNearlyFull())
)){
#ifdef SQLITE_DEBUG
i64 nExpect = pSorter->iWriteOff
+ sqlite3VarintLen(pSorter->nInMemory)
+ pSorter->nInMemory;
#endif
rc = vdbeSorterListToPMA(db, pCsr);
pSorter->nInMemory = 0;
assert( rc!=SQLITE_OK || (nExpect==pSorter->iWriteOff) );
}
return rc;
@ -704,7 +871,7 @@ int sqlite3VdbeSorterRewind(sqlite3 *db, const VdbeCursor *pCsr, int *pbEof){
return vdbeSorterSort(pCsr);
}
/* Write the current b-tree to a PMA. Close the b-tree cursor. */
/* Write the current in-memory list to a PMA. */
rc = vdbeSorterListToPMA(db, pCsr);
if( rc!=SQLITE_OK ) return rc;
@ -726,8 +893,12 @@ int sqlite3VdbeSorterRewind(sqlite3 *db, const VdbeCursor *pCsr, int *pbEof){
rc==SQLITE_OK && iNew*SORTER_MAX_MERGE_COUNT<pSorter->nPMA;
iNew++
){
int rc2; /* Return code from fileWriterFinish() */
FileWriter writer; /* Object used to write to disk */
i64 nWrite; /* Number of bytes in new PMA */
memset(&writer, 0, sizeof(FileWriter));
/* If there are SORTER_MAX_MERGE_COUNT or less PMAs in file pTemp1,
** initialize an iterator for each of them and break out of the loop.
** These iterators will be incrementally merged as the VDBE layer calls
@ -750,23 +921,30 @@ int sqlite3VdbeSorterRewind(sqlite3 *db, const VdbeCursor *pCsr, int *pbEof){
}
if( rc==SQLITE_OK ){
rc = vdbeSorterWriteVarint(pTemp2, nWrite, &iWrite2);
rc = fileWriterInit(db, pTemp2, &writer, iWrite2);
}
if( rc==SQLITE_OK ){
rc = fileWriterWriteVarint(&writer, nWrite);
}
if( rc==SQLITE_OK ){
int bEof = 0;
while( rc==SQLITE_OK && bEof==0 ){
int nToWrite;
VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
assert( pIter->pFile );
nToWrite = pIter->nKey + sqlite3VarintLen(pIter->nKey);
rc = sqlite3OsWrite(pTemp2, pIter->aAlloc, nToWrite, iWrite2);
iWrite2 += nToWrite;
rc = fileWriterWriteVarint(&writer, pIter->nKey);
if( rc==SQLITE_OK ){
rc = fileWriterWrite(&writer, pIter->aKey, pIter->nKey);
}
if( rc==SQLITE_OK ){
rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
}
}
}
rc2 = fileWriterFinish(db, &writer, &iWrite2);
if( rc==SQLITE_OK ) rc = rc2;
}
if( pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){

75
test/index5.test Normal file
View File

@ -0,0 +1,75 @@
# 2012 August 6
#
# The author disclaims copyright to this source code. In place of
# a legal notice, here is a blessing:
#
# May you do good and not evil.
# May you find forgiveness for yourself and forgive others.
# May you share freely, never taking more than you give.
#
#***********************************************************************
#
set testdir [file dirname $argv0]
source $testdir/tester.tcl
set ::testprefix index5
do_test 1.1 {
execsql {
PRAGMA page_size = 1024;
CREATE TABLE t1(x);
BEGIN;
}
for {set i 0} {$i < 100000} {incr i} {
execsql { INSERT INTO t1 VALUES(randstr(100,100)) }
}
execsql COMMIT
execsql {
CREATE INDEX i1 ON t1(x);
DROP INDEX I1;
PRAGMA main.page_size;
}
} {1024}
db close
testvfs tvfs
tvfs filter xWrite
tvfs script write_cb
proc write_cb {xCall file handle iOfst} {
if {[file tail $file]=="test.db"} {
lappend ::write_list [expr $iOfst/1024]
}
puts "$xCall $file $args"
}
do_test 1.2 {
sqlite3 db test.db -vfs tvfs
set ::write_list [list]
execsql { CREATE INDEX i1 ON t1(x) }
} {}
do_test 1.3 {
set nForward 0
set nBackward 0
set nNoncont 0
set iPrev [lindex $::write_list 0]
for {set i 1} {$i < [llength $::write_list]} {incr i} {
set iNext [lindex $::write_list $i]
if {$iNext==($iPrev+1)} {
incr nForward
} elseif {$iNext==($iPrev-1)} {
incr nBackward
} else {
incr nNoncont
}
set iPrev $iNext
}
expr {$nForward > $nBackward}
} {1}
db close
tvfs delete
finish_test