From 998147ec38ce64fff6307891b067192d2c089097 Mon Sep 17 00:00:00 2001 From: drh Date: Thu, 10 Dec 2015 02:15:03 +0000 Subject: [PATCH] Add the nBackfillAttempted field in formerly unused space in WalCkptInfo and use that field to close the race condition on opening a snapshot. FossilOrigin-Name: cb68e9d0738fc7db7316947b4d2aab91aae819f2 --- manifest | 14 ++-- manifest.uuid | 2 +- src/wal.c | 214 +++++++++++++++++++++++++------------------------- 3 files changed, 116 insertions(+), 114 deletions(-) diff --git a/manifest b/manifest index 09955e2042..32f43abbab 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Update\ssqlite3_snapshot_open()\sto\sreduce\sthe\schances\sof\sreading\sa\scorrupt\ssnapshot\screated\sby\sa\scheckpointer\sprocess\sexiting\sunexpectedly. -D 2015-12-09T20:05:27.534 +C Add\sthe\snBackfillAttempted\sfield\sin\sformerly\sunused\sspace\sin\sWalCkptInfo\sand\nuse\sthat\sfield\sto\sclose\sthe\srace\scondition\son\sopening\sa\ssnapshot. +D 2015-12-10T02:15:03.333 F Makefile.in 28bcd6149e050dff35d4dcfd97e890cd387a499d F Makefile.linux-gcc 7bc79876b875010e8c8f9502eb935ca92aa3c434 F Makefile.msc e8fdca1cb89a1b58b5f4d3a130ea9a3d28cb314d @@ -415,7 +415,7 @@ F src/vdbesort.c a7ec02da4494c59dfd071126dd3726be5a11459d F src/vdbetrace.c 8befe829faff6d9e6f6e4dee5a7d3f85cc85f1a0 F src/vtab.c 2a8b44aa372c33f6154208e7a7f6c44254549806 F src/vxworks.h c18586c8edc1bddbc15c004fa16aeb1e1342b4fb -F src/wal.c 0bd8aa8e0db924493af4c72f527afc9b9e22257a +F src/wal.c 115765a38fa4a03d7334b6ba77db0cedae682eb0 F src/wal.h 907943dfdef10b583e81906679a347e0ec6f1b1b F src/walker.c 2e14d17f592d176b6dc879c33fbdec4fbccaa2ba F src/where.c b18edbb9e5afabb77f4f27550c471c5c824e0fe7 @@ -1409,7 +1409,7 @@ F tool/vdbe_profile.tcl 246d0da094856d72d2c12efec03250d71639d19f F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4 F tool/warnings.sh 48bd54594752d5be3337f12c72f28d2080cb630b F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f -P 362615b4df94358d0264b0991c3090a0878f054c -R cfc950fbd468350b7777662be2b401a3 -U dan -Z 51de2f0ad57bbe117dc864890e31b83c +P 7315f7cbf4179aadda0f1a0baa1526a9b9f9729f +R 70ffd1e2449a67c034eb4185ab7bdad1 +U drh +Z 159fdbba63f7473f88a04e9240766d6d diff --git a/manifest.uuid b/manifest.uuid index 33aefc5d3b..028205ec43 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -7315f7cbf4179aadda0f1a0baa1526a9b9f9729f \ No newline at end of file +cb68e9d0738fc7db7316947b4d2aab91aae819f2 \ No newline at end of file diff --git a/src/wal.c b/src/wal.c index af93694e8d..784f993bfc 100644 --- a/src/wal.c +++ b/src/wal.c @@ -272,7 +272,8 @@ int sqlite3WalTrace = 0; /* ** Indices of various locking bytes. WAL_NREADER is the number -** of available reader locks and should be at least 3. +** of available reader locks and should be at least 3. The default +** is SQLITE_SHM_NLOCK==8 and WAL_NREADER==5. */ #define WAL_WRITE_LOCK 0 #define WAL_ALL_BUT_WRITE 1 @@ -292,7 +293,10 @@ typedef struct WalCkptInfo WalCkptInfo; ** The following object holds a copy of the wal-index header content. ** ** The actual header in the wal-index consists of two copies of this -** object. +** object followed by one instance of the WalCkptInfo object. +** For all versions of SQLite through 3.10.0 and probably beyond, +** the locking bytes (WalCkptInfo.aLock) start at offset 120 and +** the total header size is 136 bytes. ** ** The szPage value can be any power of 2 between 512 and 32768, inclusive. ** Or it can be 1 to represent a 65536-byte page. The latter case was @@ -325,6 +329,16 @@ struct WalIndexHdr { ** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from ** mxFrame back to zero when the WAL is reset. ** +** nBackfillAttempted is the largest value of nBackfill that a checkpoint +** has attempted to achieve. Normally nBackfill==nBackfillAtempted, however +** the nBackfillAttempted is set before any backfilling is done and the +** nBackfill is only set afte rall backfilling completes. So if a checkpoint +** crashes, nBackfillAttempted might be larger than nBackfill. The +** WalIndexHdr.mxFrame must never be less than nBackfillAttempted. +** +** The aLock[] field is a set of bytes used for locking. These bytes should +** never be read or written. +** ** There is one entry in aReadMark[] for each reader lock. If a reader ** holds read-lock K, then the value in aReadMark[K] is no greater than ** the mxFrame for that reader. The value READMARK_NOT_USED (0xffffffff) @@ -364,6 +378,9 @@ struct WalIndexHdr { struct WalCkptInfo { u32 nBackfill; /* Number of WAL frames backfilled into DB */ u32 aReadMark[WAL_NREADER]; /* Reader marks */ + u8 aLock[SQLITE_SHM_NLOCK]; /* Reserved space for locks */ + u32 nBackfillAttempted; /* WAL frames perhaps written, or maybe not */ + u32 notUsed0; /* Available for future enhancements */ }; #define READMARK_NOT_USED 0xffffffff @@ -373,9 +390,8 @@ struct WalCkptInfo { ** only support mandatory file-locks, we do not read or write data ** from the region of the file on which locks are applied. */ -#define WALINDEX_LOCK_OFFSET (sizeof(WalIndexHdr)*2 + sizeof(WalCkptInfo)) -#define WALINDEX_LOCK_RESERVED 16 -#define WALINDEX_HDR_SIZE (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED) +#define WALINDEX_LOCK_OFFSET (sizeof(WalIndexHdr)*2+offsetof(WalCkptInfo,aLock)) +#define WALINDEX_HDR_SIZE (sizeof(WalIndexHdr)*2+sizeof(WalCkptInfo)) /* Size of header before each frame in wal */ #define WAL_FRAME_HDRSIZE 24 @@ -435,7 +451,7 @@ struct Wal { u8 lockError; /* True if a locking error has occurred */ #endif #ifdef SQLITE_ENABLE_SNAPSHOT - WalIndexHdr *pSnapshot; + WalIndexHdr *pSnapshot; /* Start transaction here if not NULL */ #endif }; @@ -1201,6 +1217,7 @@ finished: */ pInfo = walCkptInfo(pWal); pInfo->nBackfill = 0; + pInfo->nBackfillAttempted = 0; pInfo->aReadMark[0] = 0; for(i=1; iaReadMark[i] = READMARK_NOT_USED; if( pWal->hdr.mxFrame ) pInfo->aReadMark[1] = pWal->hdr.mxFrame; @@ -1272,7 +1289,11 @@ int sqlite3WalOpen( /* In the amalgamation, the os_unix.c and os_win.c source files come before ** this source file. Verify that the #defines of the locking byte offsets ** in os_unix.c and os_win.c agree with the WALINDEX_LOCK_OFFSET value. + ** For that matter, if the lock offset ever changes from its initial design + ** value of 120, we need to know that so there is an assert() to check it. */ + assert( 120==WALINDEX_LOCK_OFFSET ); + assert( 136==WALINDEX_HDR_SIZE ); #ifdef WIN_SHM_BASE assert( WIN_SHM_BASE==WALINDEX_LOCK_OFFSET ); #endif @@ -1658,6 +1679,7 @@ static void walRestartHdr(Wal *pWal, u32 salt1){ memcpy(&pWal->hdr.aSalt[1], &salt1, 4); walIndexWriteHdr(pWal); pInfo->nBackfill = 0; + pInfo->nBackfillAttempted = 0; pInfo->aReadMark[1] = 0; for(i=2; iaReadMark[i] = READMARK_NOT_USED; assert( pInfo->aReadMark[0]==0 ); @@ -1735,6 +1757,7 @@ static int walCheckpoint( ** cannot be backfilled from the WAL. */ mxSafeFrame = pWal->hdr.mxFrame; + pInfo->nBackfillAttempted = mxSafeFrame; mxPage = pWal->hdr.nPage; for(i=1; ireadOnly & WAL_SHM_RDONLY)==0 - && (mxReadMarkreadOnly & WAL_SHM_RDONLY)==0 + && (mxReadMarkpSnapshot==0 + && pWal->pSnapshot==0 #endif - ){ - for(i=1; iaReadMark[i] = mxFrame; - mxI = i; - walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); - break; - }else if( rc!=SQLITE_BUSY ){ - return rc; - } + ){ + for(i=1; iaReadMark[i] = mxFrame; + mxI = i; + walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); + break; + }else if( rc!=SQLITE_BUSY ){ + return rc; } } - if( mxI==0 ){ + } + if( mxI==0 ){ #ifdef SQLITE_ENABLE_SNAPSHOT - if( pWal->pSnapshot ) return SQLITE_BUSY_SNAPSHOT; + if( pWal->pSnapshot ) return SQLITE_BUSY_SNAPSHOT; #endif - assert( rc==SQLITE_BUSY || (pWal->readOnly & WAL_SHM_RDONLY)!=0 ); - return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTLOCK; - } + assert( rc==SQLITE_BUSY || (pWal->readOnly & WAL_SHM_RDONLY)!=0 ); + return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTLOCK; + } - rc = walLockShared(pWal, WAL_READ_LOCK(mxI)); - if( rc ){ - return rc==SQLITE_BUSY ? WAL_RETRY : rc; - } - /* Now that the read-lock has been obtained, check that neither the - ** value in the aReadMark[] array or the contents of the wal-index - ** header have changed. - ** - ** It is necessary to check that the wal-index header did not change - ** between the time it was read and when the shared-lock was obtained - ** on WAL_READ_LOCK(mxI) was obtained to account for the possibility - ** that the log file may have been wrapped by a writer, or that frames - ** that occur later in the log than pWal->hdr.mxFrame may have been - ** copied into the database by a checkpointer. If either of these things - ** happened, then reading the database with the current value of - ** pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry - ** instead. - ** - ** Before checking that the live wal-index header has not changed - ** since it was read, set Wal.minFrame to the first frame in the wal - ** file that has not yet been checkpointed. This client will not need - ** to read any frames earlier than minFrame from the wal file - they - ** can be safely read directly from the database file. - ** - ** Because a ShmBarrier() call is made between taking the copy of - ** nBackfill and checking that the wal-header in shared-memory still - ** matches the one cached in pWal->hdr, it is guaranteed that the - ** checkpointer that set nBackfill was not working with a wal-index - ** header newer than that cached in pWal->hdr. If it were, that could - ** cause a problem. The checkpointer could omit to checkpoint - ** a version of page X that lies before pWal->minFrame (call that version - ** A) on the basis that there is a newer version (version B) of the same - ** page later in the wal file. But if version B happens to like past - ** frame pWal->hdr.mxFrame - then the client would incorrectly assume - ** that it can read version A from the database file. However, since - ** we can guarantee that the checkpointer that set nBackfill could not - ** see any pages past pWal->hdr.mxFrame, this problem does not come up. - */ - pWal->minFrame = pInfo->nBackfill+1; - walShmBarrier(pWal); - if( pInfo->aReadMark[mxI]!=mxReadMark - || memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) - ){ - walUnlockShared(pWal, WAL_READ_LOCK(mxI)); - return WAL_RETRY; - }else{ - assert( mxReadMark<=pWal->hdr.mxFrame ); - pWal->readLock = (i16)mxI; - } + rc = walLockShared(pWal, WAL_READ_LOCK(mxI)); + if( rc ){ + return rc==SQLITE_BUSY ? WAL_RETRY : rc; + } + /* Now that the read-lock has been obtained, check that neither the + ** value in the aReadMark[] array or the contents of the wal-index + ** header have changed. + ** + ** It is necessary to check that the wal-index header did not change + ** between the time it was read and when the shared-lock was obtained + ** on WAL_READ_LOCK(mxI) was obtained to account for the possibility + ** that the log file may have been wrapped by a writer, or that frames + ** that occur later in the log than pWal->hdr.mxFrame may have been + ** copied into the database by a checkpointer. If either of these things + ** happened, then reading the database with the current value of + ** pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry + ** instead. + ** + ** Before checking that the live wal-index header has not changed + ** since it was read, set Wal.minFrame to the first frame in the wal + ** file that has not yet been checkpointed. This client will not need + ** to read any frames earlier than minFrame from the wal file - they + ** can be safely read directly from the database file. + ** + ** Because a ShmBarrier() call is made between taking the copy of + ** nBackfill and checking that the wal-header in shared-memory still + ** matches the one cached in pWal->hdr, it is guaranteed that the + ** checkpointer that set nBackfill was not working with a wal-index + ** header newer than that cached in pWal->hdr. If it were, that could + ** cause a problem. The checkpointer could omit to checkpoint + ** a version of page X that lies before pWal->minFrame (call that version + ** A) on the basis that there is a newer version (version B) of the same + ** page later in the wal file. But if version B happens to like past + ** frame pWal->hdr.mxFrame - then the client would incorrectly assume + ** that it can read version A from the database file. However, since + ** we can guarantee that the checkpointer that set nBackfill could not + ** see any pages past pWal->hdr.mxFrame, this problem does not come up. + */ + pWal->minFrame = pInfo->nBackfill+1; + walShmBarrier(pWal); + if( pInfo->aReadMark[mxI]!=mxReadMark + || memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) + ){ + walUnlockShared(pWal, WAL_READ_LOCK(mxI)); + return WAL_RETRY; + }else{ + assert( mxReadMark<=pWal->hdr.mxFrame ); + pWal->readLock = (i16)mxI; } return rc; } @@ -2373,7 +2393,7 @@ int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){ #ifdef SQLITE_ENABLE_SNAPSHOT int bChanged = 0; WalIndexHdr *pSnapshot = pWal->pSnapshot; - if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))){ + if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){ bChanged = 1; } #endif @@ -2388,36 +2408,18 @@ int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){ #ifdef SQLITE_ENABLE_SNAPSHOT if( rc==SQLITE_OK ){ - if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr)) ){ + if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){ /* At this point the client has a lock on an aReadMark[] slot holding - ** a value equal to or smaller than pSnapshot->mxFrame. This client - ** did not populate the aReadMark[] slot. pWal->hdr is populated with - ** the wal-index header for the snapshot currently at the head of the - ** wal file, which is different from pSnapshot. + ** a value equal to or smaller than pSnapshot->mxFrame. Verify that + ** pSnapshot is still valid before continuing. Reasons why pSnapshot + ** might no longer be valid: ** - ** The presence of the aReadMark[] slot entry makes it very likely - ** that either there is currently another read-transaction open on - ** pSnapshot, or that there has been one more recently than the last - ** checkpoint of any frames greater than pSnapshot->mxFrame was - ** started. There is an exception though: client 1 may have called - ** walTryBeginRead and started to open snapshot pSnapshot, setting - ** the aReadMark[] slot to do so. At the same time, client 2 may - ** have committed a new snapshot to disk and started a checkpoint. - ** In this circumstance client 1 does not end up reading pSnapshot, - ** but may leave the aReadMark[] slot populated. + ** (1) The WAL file has been reset since the snapshot was taken. + ** In this case, the salt will have changed. ** - ** The race condition above is difficult to detect. One approach would - ** be to check the aReadMark[] slot for another client. But this is - ** prone to false-positives from other snapshot clients. And there - ** is no equivalent to xCheckReservedLock() for wal locks. Another - ** approach would be to take the checkpointer lock and check that - ** fewer than pSnapshot->mxFrame frames have been checkpointed. But - ** that does not account for checkpointer processes that failed after - ** checkpointing frames but before updating WalCkptInfo.nBackfill. - ** And it would mean that this function would block on checkpointers - ** and vice versa. - ** - ** TODO: For now, this race condition is ignored. + ** (2) A checkpoint as been attempted that wrote frames past + ** pSnapshot->mxFrame into the database file. Note that the + ** checkpoint need not have completed for this to cause problems. */ volatile WalCkptInfo *pInfo = walCkptInfo(pWal); @@ -2428,8 +2430,8 @@ int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){ ** overwrite pWal->hdr with *pSnapshot and set *pChanged as appropriate ** for opening the snapshot. Or, if the wal file has been wrapped ** since pSnapshot was written, return SQLITE_BUSY_SNAPSHOT. */ - if( pSnapshot->aSalt[0]==pWal->hdr.aSalt[0] - && pSnapshot->aSalt[1]==pWal->hdr.aSalt[1] + if( memcmp(pSnapshot->aSalt, pWal->hdr.aSalt, sizeof(pWal->hdr.aSalt))==0 + && pSnapshot->mxFrame>=pInfo->nBackfillAttempted ){ memcpy(&pWal->hdr, pSnapshot, sizeof(WalIndexHdr)); *pChanged = bChanged;