Modify the format of the wal-index to use a hash table to index log file segments.

FossilOrigin-Name: 40b0a6357b160e04326ab176955a68a1cf3f8b7c
This commit is contained in:
dan 2010-05-10 14:46:09 +00:00
parent acd0781892
commit bb23aff3df
3 changed files with 203 additions and 113 deletions

View File

@ -1,8 +1,5 @@
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
C If\san\sATTACH\scommand\sfiles\sdue\sto\sOP_JournalMode\sbut\sstill\sattaches\sthe\ndatabase,\smake\ssure\sVACUUM\sstill\sdetaches\sit\swhen\sdone.
D 2010-05-10T14:10:58
C Modify\sthe\sformat\sof\sthe\swal-index\sto\suse\sa\shash\stable\sto\sindex\slog\sfile\ssegments.
D 2010-05-10T14:46:09
F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0
F Makefile.in a5cad1f8f3e021356bfcc6c77dc16f6f1952bbc3
F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654
@ -227,7 +224,7 @@ F src/vdbeblob.c 5327132a42a91e8b7acfb60b9d2c3b1c5c863e0e
F src/vdbemem.c 2a82f455f6ca6f78b59fb312f96054c04ae0ead1
F src/vdbetrace.c 864cef96919323482ebd9986f2132435115e9cc2
F src/vtab.c a0f8a40274e4261696ef57aa806de2776ab72cda
F src/wal.c 66147e8b8623050a970ac9ba64688b95eb4e3bc3
F src/wal.c 65d1376c8d5ce500ac1fd1bf49e2287ad04cf6c4
F src/wal.h b4c42014b5fa3b4e6244ac8c65de7ff67adeb27c
F src/walker.c 3112bb3afe1d85dc52317cb1d752055e9a781f8f
F src/where.c 75fee9e255b62f773fcadd1d1f25b6f63ac7a356
@ -816,14 +813,7 @@ F tool/speedtest2.tcl ee2149167303ba8e95af97873c575c3e0fab58ff
F tool/speedtest8.c 2902c46588c40b55661e471d7a86e4dd71a18224
F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e
F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
P 0bdea4cfbd7832f2a00c01b93c92ba13d20139ef
R 9014c21dcf58ddf23dc85907a38f6c75
U drh
Z d231509bf5d75d0a7182ae588ca4bb8b
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.6 (GNU/Linux)
iD4DBQFL6BP2oxKgR168RlERAignAJQNAle/gWkbKShnq3kcL+DtdcO2AJ9vguYT
n1LeBzEa/iP6Gu/jqjlBKQ==
=h4WY
-----END PGP SIGNATURE-----
P 6ecdc7ba2b5e79e8b5862fb49cf6c2b99a40659a
R d05ea03295a9fb019dfbc21cf1e7b6fc
U dan
Z acd3bcc163b4a5bf1cebc63a218dff15

View File

@ -1 +1 @@
6ecdc7ba2b5e79e8b5862fb49cf6c2b99a40659a
40b0a6357b160e04326ab176955a68a1cf3f8b7c

290
src/wal.c
View File

@ -356,6 +356,16 @@ static void walMergesort8(
#endif
}
/*
** Define the size of the hash tables in the wal-index file. There
** is a hash-table following every HASHTABLE_NPAGE page numbers in the
** wal-index.
*/
#define HASHTABLE_NPAGE 4096
#define HASHTABLE_DATATYPE u16
#define HASHTABLE_NSLOT (HASHTABLE_NPAGE*2)
#define HASHTABLE_NBYTE (sizeof(HASHTABLE_DATATYPE)*HASHTABLE_NSLOT)
/*
** Return the index in the WalIndex.aData array that corresponds to
@ -365,8 +375,8 @@ static void walMergesort8(
static int walIndexEntry(u32 iFrame){
return (
(WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)/sizeof(u32)
+ (((iFrame-1)>>8)<<6) /* Indexes that occur before iFrame */
+ iFrame-1 /* Db page numbers that occur before iFrame */
+ (((iFrame-1)/HASHTABLE_NPAGE) * HASHTABLE_NBYTE)/sizeof(u32)
+ (iFrame-1)
);
}
@ -377,9 +387,10 @@ static int walIndexEntry(u32 iFrame){
** required by the 256-byte index block.
*/
static int walMappingSize(u32 iFrame){
return ( WALINDEX_LOCK_OFFSET + WALINDEX_LOCK_RESERVED
+ iFrame*sizeof(u32)
+ (iFrame>>8)*256
const int nByte = (sizeof(u32)*HASHTABLE_NPAGE + HASHTABLE_NBYTE) ;
return ( WALINDEX_LOCK_OFFSET
+ WALINDEX_LOCK_RESERVED
+ nByte * ((iFrame + HASHTABLE_NPAGE - 1)/HASHTABLE_NPAGE)
);
}
@ -441,6 +452,56 @@ static int walIndexRemap(Wal *pWal, int enlargeTo){
*/
#define WALINDEX_MMAP_INCREMENT (64*1024)
static int walHashKey(u32 iPage){
return (iPage*2) % (HASHTABLE_NSLOT-1);
}
/*
** Find the hash table and (section of the) page number array used to
** store data for WAL frame iFrame.
**
** Set output variable *paHash to point to the start of the hash table
** in the wal-index file. Set *piZero to one less than the frame
** number of the first frame indexed by this hash table. If a
** slot in the hash table is set to N, it refers to frame number
** (*piZero+N) in the log.
**
** Finally, set *paPgno such that for all frames F between (*piZero+1) and
** (*piZero+HASHTABLE_NPAGE), (*paPgno)[F] is the database page number
** associated with frame F.
*/
static void walHashFind(
Wal *pWal, /* WAL handle */
u32 iFrame, /* Find the hash table indexing this frame */
HASHTABLE_DATATYPE **paHash, /* OUT: Pointer to hash index */
u32 **paPgno, /* OUT: Pointer to page number array */
u32 *piZero /* OUT: Frame associated with *paPgno[0] */
){
u32 iZero;
u32 *aPgno;
HASHTABLE_DATATYPE *aHash;
iZero = ((iFrame-1)/HASHTABLE_NPAGE) * HASHTABLE_NPAGE;
aPgno = &pWal->pWiData[walIndexEntry(iZero+1)-iZero-1];
aHash = (HASHTABLE_DATATYPE *)&aPgno[iZero+HASHTABLE_NPAGE+1];
/* Assert that:
**
** + the mapping is large enough for this hash-table, and
**
** + that aPgno[iZero+1] really is the database page number associated
** with the first frame indexed by this hash table.
*/
assert( (u32*)(&aHash[HASHTABLE_NSLOT])<=&pWal->pWiData[pWal->szWIndex/4] );
assert( walIndexEntry(iZero+1)==(&aPgno[iZero+1] - pWal->pWiData) );
*paHash = aHash;
*paPgno = aPgno;
*piZero = iZero;
}
/*
** Set an entry in the wal-index map to map log frame iFrame to db
** page iPage. Values are always appended to the wal-index (i.e. the
@ -449,47 +510,38 @@ static int walIndexRemap(Wal *pWal, int enlargeTo){
** here.
*/
static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
int rc;
u32 iSlot = walIndexEntry(iFrame);
int rc; /* Return code */
int nMapping; /* Required mapping size in bytes */
/* Make sure the wal-index is mapped. Enlarge the mapping if required. */
nMapping = walMappingSize(iFrame);
rc = walIndexMap(pWal, -1);
if( rc!=SQLITE_OK ){
return rc;
}
while( ((iSlot+128)*sizeof(u32))>=pWal->szWIndex ){
while( rc==SQLITE_OK && nMapping>pWal->szWIndex ){
int nByte = pWal->szWIndex + WALINDEX_MMAP_INCREMENT;
/* Enlarge the storage, then remap it. */
rc = walIndexRemap(pWal, nByte);
if( rc!=SQLITE_OK ){
return rc;
}
}
/* Set the wal-index entry itself */
pWal->pWiData[iSlot] = iPage;
/* If the frame number is a multiple of 256 (frames are numbered starting
** at 1), build an index of the most recently added 256 frames.
/* Assuming the wal-index file was successfully mapped, find the hash
** table and section of of the page number array that pertain to frame
** iFrame of the WAL. Then populate the page number array and the hash
** table entry.
*/
if( (iFrame&0x000000FF)==0 ){
int i; /* Iterator used while initializing aIndex */
u32 *aFrame; /* Pointer to array of 256 frames */
int nIndex; /* Number of entries in index */
u8 *aIndex; /* 256 bytes to build index in */
u8 *aTmp; /* Scratch space to use while sorting */
if( rc==SQLITE_OK ){
int iKey; /* Hash table key */
u32 iZero; /* One less than frame number of aPgno[1] */
u32 *aPgno; /* Page number array */
HASHTABLE_DATATYPE *aHash; /* Hash table */
int idx; /* Value to write to hash-table slot */
aFrame = &pWal->pWiData[iSlot-255];
aIndex = (u8 *)&pWal->pWiData[iSlot+1];
aTmp = &aIndex[256];
nIndex = 256;
for(i=0; i<256; i++) aIndex[i] = (u8)i;
walMergesort8(aFrame, aTmp, aIndex, &nIndex);
memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
walHashFind(pWal, iFrame, &aHash, &aPgno, &iZero);
idx = iFrame - iZero;
if( idx==1 ) memset(aHash, 0, HASHTABLE_NBYTE);
aPgno[iFrame] = iPage;
for(iKey=walHashKey(iPage); aHash[iKey]; iKey=(iKey+1)%HASHTABLE_NSLOT);
aHash[iKey] = idx;
}
return SQLITE_OK;
return rc;
}
@ -684,7 +736,6 @@ static int walIteratorNext(
}
pSegment->iNext++;
}
nBlock = 256;
}
@ -700,7 +751,6 @@ static int walIteratorInit(Wal *pWal, WalIterator **pp){
int nByte; /* Number of bytes to allocate */
int i; /* Iterator variable */
int nFinal; /* Number of unindexed entries */
struct WalSegment *pFinal; /* Final (unindexed) segment */
u8 *aTmp; /* Temp space used by merge-sort */
int rc; /* Return code of walIndexMap() */
@ -713,28 +763,30 @@ static int walIteratorInit(Wal *pWal, WalIterator **pp){
nSegment = (iLast >> 8) + 1;
nFinal = (iLast & 0x000000FF);
nByte = sizeof(WalIterator) + (nSegment-1)*sizeof(struct WalSegment) + 512;
nByte = sizeof(WalIterator) + (nSegment+1)*(sizeof(struct WalSegment)+256);
p = (WalIterator *)sqlite3_malloc(nByte);
if( !p ){
rc = SQLITE_NOMEM;
}else{
u8 *aSpace;
memset(p, 0, nByte);
p->nSegment = nSegment;
for(i=0; i<nSegment-1; i++){
aSpace = (u8 *)&p->aSegment[nSegment];
aTmp = &aSpace[nSegment*256];
for(i=0; i<nSegment; i++){
int j;
int nIndex = (i==nSegment-1) ? nFinal : 256;
p->aSegment[i].aDbPage = &aData[walIndexEntry(i*256+1)];
p->aSegment[i].aIndex = (u8 *)&aData[walIndexEntry(i*256+1)+256];
p->aSegment[i].aIndex = aSpace;
for(j=0; j<nIndex; j++){
aSpace[j] = j;
}
walMergesort8(p->aSegment[i].aDbPage, aTmp, aSpace, &nIndex);
memset(&aSpace[nIndex], aSpace[nIndex-1], 256-nIndex);
aSpace += 256;
p->nFinal = nIndex;
}
pFinal = &p->aSegment[nSegment-1];
pFinal->aDbPage = &aData[walIndexEntry((nSegment-1)*256+1)];
pFinal->aIndex = (u8 *)&pFinal[1];
aTmp = &pFinal->aIndex[256];
for(i=0; i<nFinal; i++){
pFinal->aIndex[i] = i;
}
walMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
p->nFinal = nFinal;
}
*pp = p;
@ -1020,70 +1072,118 @@ void sqlite3WalCloseSnapshot(Wal *pWal){
** Read a page from the log, if it is present.
*/
int sqlite3WalRead(
Wal *pWal,
Pgno pgno,
int *pInWal,
int nOut,
u8 *pOut
Wal *pWal, /* WAL handle */
Pgno pgno, /* Database page number to read data for */
int *pInWal, /* OUT: True if data is read from WAL */
int nOut, /* Size of buffer pOut in bytes */
u8 *pOut /* Buffer to write page data to */
){
int rc; /* Return code */
u32 iRead = 0;
u32 *aData;
int iFrame = (pWal->hdr.iLastPg & 0xFFFFFF00);
u32 iRead = 0; /* If !=0, WAL frame to return data from */
u32 iLast = pWal->hdr.iLastPg; /* Last page in WAL for this reader */
int iHash; /* Used to loop through N hash tables */
/* If the "last page" field of the wal-index header snapshot is 0, then
** no data will be read from the wal under any circumstances. Return early
** in this case to avoid the walIndexMap/Unmap overhead.
*/
if( iLast==0 ){
*pInWal = 0;
return SQLITE_OK;
}
/* Ensure the wal-index is mapped. */
assert( pWal->lockState==SQLITE_SHM_READ||pWal->lockState==SQLITE_SHM_WRITE );
rc = walIndexMap(pWal, walMappingSize(pWal->hdr.iLastPg));
rc = walIndexMap(pWal, walMappingSize(iLast));
if( rc!=SQLITE_OK ){
return rc;
}
/* Do a linear search of the unindexed block of page-numbers (if any)
** at the end of the wal-index. An alternative to this would be to
** build an index in private memory each time a read transaction is
** opened on a new snapshot.
/* Search the hash table or tables for an entry matching page number
** pgno. Each iteration of the following for() loop searches one
** hash table (each hash table indexes up to HASHTABLE_NPAGE frames).
**
** This code may run concurrently to the code in walIndexAppend()
** that adds entries to the wal-index (and possibly to this hash
** table). This means the non-zero value just read from the hash
** slot (aHash[iKey]) may have been added before or after the
** current read transaction was opened. Values added after the
** read transaction was opened may have been written incorrectly -
** i.e. these slots may contain garbage data. However, we assume
** that any slots written before the current read transaction was
** opened remain unmodified.
**
** For the reasons above, the if(...) condition featured in the inner
** loop of the following block is more stringent that would be required
** if we had exclusive access to the hash-table:
**
** (aPgno[iFrame]==pgno):
** This condition filters out normal hash-table collisions.
**
** (iFrame<=iLast):
** This condition filters out entries that were added to the hash
** table after the current read-transaction had started.
**
** (iFrame>iRead):
** This filters out a dangerous class of garbage data. The
** garbage hash slot may refer to a frame with the correct page
** number, but not the most recent version of the frame. For
** example, if at the start of the read-transaction the log
** contains three copies of the desired page in frames 2, 3 and 4,
** the hash table may contain the following:
**
** { ..., 2, 3, 4, 0, 0, ..... }
**
** The correct answer is to read data from frame 4. But a
** dirty-read may potentially cause the hash-table to appear as
** follows to the reader:
**
** { ..., 2, 3, 4, 3, 0, ..... }
**
** Without this part of the if(...) clause, the reader might
** incorrectly read data from frame 3 instead of 4. This would be
** an error.
**
** It is not actually clear to the developers that such a dirty-read
** can occur. But if it does, it should not cause any problems.
*/
aData = pWal->pWiData;
if( pWal->hdr.iLastPg ){
u32 *pi = &aData[walIndexEntry(pWal->hdr.iLastPg)];
u32 *piStop = pi - (pWal->hdr.iLastPg & 0xFF);
while( *pi!=pgno && pi!=piStop ) pi--;
if( pi!=piStop ){
iRead = (pi-piStop) + iFrame;
for(iHash=iLast; iHash>0 && iRead==0; iHash-=HASHTABLE_NPAGE){
HASHTABLE_DATATYPE *aHash; /* Pointer to hash table */
u32 *aPgno; /* Pointer to array of page numbers */
u32 iZero; /* Frame number corresponding to aPgno[0] */
int iKey; /* Hash slot index */
walHashFind(pWal, iHash, &aHash, &aPgno, &iZero);
for(iKey=walHashKey(pgno); aHash[iKey]; iKey=(iKey+1)%HASHTABLE_NSLOT){
u32 iFrame = aHash[iKey] + iZero;
if( iFrame<=iLast && aPgno[iFrame]==pgno && iFrame>iRead ){
iRead = iFrame;
}
}
}
assert( iRead==0 || aData[walIndexEntry(iRead)]==pgno );
assert( iRead==0 || pWal->pWiData[walIndexEntry(iRead)]==pgno );
while( iRead==0 && iFrame>0 ){
int iLow = 0;
int iHigh = 255;
u32 *aFrame;
u8 *aIndex;
iFrame -= 256;
aFrame = &aData[walIndexEntry(iFrame+1)];
aIndex = (u8 *)&aFrame[256];
while( iLow<=iHigh ){
int iTest = (iLow+iHigh)>>1;
u32 iPg = aFrame[aIndex[iTest]];
if( iPg==pgno ){
iRead = iFrame + 1 + aIndex[iTest];
#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
/* If expensive assert() statements are available, do a linear search
** of the wal-index file content. Make sure the results agree with the
** result obtained using the hash indexes above. */
{
u32 iRead2 = 0;
u32 iTest;
for(iTest=iLast; iTest>0; iTest--){
if( pWal->pWiData[walIndexEntry(iTest)]==pgno ){
iRead2 = iTest;
break;
}
else if( iPg<pgno ){
iLow = iTest+1;
}else{
iHigh = iTest-1;
}
}
assert( iRead==iRead2 );
}
assert( iRead==0 || aData[walIndexEntry(iRead)]==pgno );
walIndexUnmap(pWal);
#endif
/* If iRead is non-zero, then it is the log frame number that contains the
** required page. Read and return data from the log file.
*/
walIndexUnmap(pWal);
if( iRead ){
i64 iOffset = walFrameOffset(iRead, pWal->hdr.pgsz) + WAL_FRAME_HDRSIZE;
*pInWal = 1;