1be5051923
FossilOrigin-Name: cf6da4a52f7f9047e653ef2972e4c0910b29d7182d789a9e30225dc1849e8779
1983 lines
58 KiB
C
1983 lines
58 KiB
C
/*
|
|
** 2012-01-23
|
|
**
|
|
** The author disclaims copyright to this source code. In place of
|
|
** a legal notice, here is a blessing:
|
|
**
|
|
** May you do good and not evil.
|
|
** May you find forgiveness for yourself and forgive others.
|
|
** May you share freely, never taking more than you give.
|
|
**
|
|
*************************************************************************
|
|
**
|
|
** Utilities used to help multiple LSM clients to coexist within the
|
|
** same process space.
|
|
*/
|
|
#include "lsmInt.h"
|
|
|
|
/*
|
|
** Global data. All global variables used by code in this file are grouped
|
|
** into the following structure instance.
|
|
**
|
|
** pDatabase:
|
|
** Linked list of all Database objects allocated within this process.
|
|
** This list may not be traversed without holding the global mutex (see
|
|
** functions enterGlobalMutex() and leaveGlobalMutex()).
|
|
*/
|
|
static struct SharedData {
|
|
Database *pDatabase; /* Linked list of all Database objects */
|
|
} gShared;
|
|
|
|
/*
|
|
** Database structure. There is one such structure for each distinct
|
|
** database accessed by this process. They are stored in the singly linked
|
|
** list starting at global variable gShared.pDatabase. Database objects are
|
|
** reference counted. Once the number of connections to the associated
|
|
** database drops to zero, they are removed from the linked list and deleted.
|
|
**
|
|
** pFile:
|
|
** In multi-process mode, this file descriptor is used to obtain locks
|
|
** and to access shared-memory. In single process mode, its only job is
|
|
** to hold the exclusive lock on the file.
|
|
**
|
|
*/
|
|
struct Database {
|
|
/* Protected by the global mutex (enterGlobalMutex/leaveGlobalMutex): */
|
|
char *zName; /* Canonical path to database file */
|
|
int nName; /* strlen(zName) */
|
|
int nDbRef; /* Number of associated lsm_db handles */
|
|
Database *pDbNext; /* Next Database structure in global list */
|
|
|
|
/* Protected by the local mutex (pClientMutex) */
|
|
int bReadonly; /* True if Database.pFile is read-only */
|
|
int bMultiProc; /* True if running in multi-process mode */
|
|
lsm_file *pFile; /* Used for locks/shm in multi-proc mode */
|
|
LsmFile *pLsmFile; /* List of deferred closes */
|
|
lsm_mutex *pClientMutex; /* Protects the apShmChunk[] and pConn */
|
|
int nShmChunk; /* Number of entries in apShmChunk[] array */
|
|
void **apShmChunk; /* Array of "shared" memory regions */
|
|
lsm_db *pConn; /* List of connections to this db. */
|
|
};
|
|
|
|
/*
|
|
** Functions to enter and leave the global mutex. This mutex is used
|
|
** to protect the global linked-list headed at gShared.pDatabase.
|
|
*/
|
|
/*
** Look up the static LSM_MUTEX_GLOBAL mutex and enter it. Returns LSM_OK
** on success, or the error code from lsmMutexStatic() if the mutex could
** not be obtained (in which case it is not entered).
*/
static int enterGlobalMutex(lsm_env *pEnv){
  lsm_mutex *pGlobal;
  int rc;

  rc = lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &pGlobal);
  if( rc!=LSM_OK ) return rc;

  lsmMutexEnter(pEnv, pGlobal);
  return LSM_OK;
}
|
|
/*
** Leave the static LSM_MUTEX_GLOBAL mutex. The result of the
** lsmMutexStatic() lookup is not checked here.
*/
static void leaveGlobalMutex(lsm_env *pEnv){
  lsm_mutex *pGlobal;

  lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &pGlobal);
  lsmMutexLeave(pEnv, pGlobal);
}
|
|
|
|
#ifdef LSM_DEBUG
|
|
/*
** Debug-only helper for use in assert() statements: returns the result of
** lsmMutexHeld() for the static LSM_MUTEX_GLOBAL mutex.
*/
static int holdingGlobalMutex(lsm_env *pEnv){
  lsm_mutex *pGlobal;
  int bHeld;

  lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &pGlobal);
  bHeld = lsmMutexHeld(pEnv, pGlobal);
  return bHeld;
}
|
|
#endif
|
|
|
|
#if 0
/* Disabled debugging check: assert that no entry of free-list p refers
** to block iBlk. */
static void assertNotInFreelist(Freelist *p, int iBlk){
  int iEntry = 0;
  while( iEntry<p->nEntry ){
    assert( p->aEntry[iEntry].iBlk!=iBlk );
    iEntry++;
  }
}
#else
/* The check above is compiled out, so the macro expands to nothing. */
# define assertNotInFreelist(x,y)
#endif
|
|
|
|
/*
|
|
** Append an entry to the free-list. If (iId==-1), this is a delete.
|
|
*/
|
|
/*
** Set the free-list entry for block iBlk to snapshot id iId, keeping the
** aEntry[] array sorted by ascending block number. An existing entry for
** iBlk is overwritten; otherwise a new one is inserted. The list updated
** is db->pFreelist if db->bUseFreelist is set, or the worker snapshot's
** free-list if it is not.
**
** Returns LSM_OK, or LSM_NOMEM_BKPT if the array could not be enlarged.
*/
int freelistAppend(lsm_db *db, u32 iBlk, i64 iId){
  lsm_env *pEnv = db->pEnv;
  Freelist *pList;
  FreelistEntry *pSlot;
  int iPos;

  assert( iId==-1 || iId>=0 );
  if( db->bUseFreelist ){
    pList = db->pFreelist;
  }else{
    pList = &db->pWorker->freelist;
  }

  /* When the array is full, grow it: 4 slots initially, doubling after. */
  assert( pList->nAlloc>=pList->nEntry );
  if( pList->nAlloc==pList->nEntry ){
    int nAllocNew = (pList->nAlloc==0 ? 4 : pList->nAlloc*2);
    int nByteNew = sizeof(FreelistEntry) * nAllocNew;
    FreelistEntry *aGrown;

    aGrown = (FreelistEntry *)lsmRealloc(pEnv, pList->aEntry, nByteNew);
    if( !aGrown ) return LSM_NOMEM_BKPT;
    pList->nAlloc = nAllocNew;
    pList->aEntry = aGrown;
  }

  /* Locate the first entry whose block number is not less than iBlk. */
  iPos = 0;
  while( iPos<pList->nEntry ){
    assert( iPos==0
         || pList->aEntry[iPos].iBlk > pList->aEntry[iPos-1].iBlk
    );
    if( pList->aEntry[iPos].iBlk>=iBlk ) break;
    iPos++;
  }
  pSlot = &pList->aEntry[iPos];

  if( iPos>=pList->nEntry || pSlot->iBlk!=iBlk ){
    /* No entry for iBlk yet. Shift the tail up by one slot and insert. */
    int nMove = sizeof(FreelistEntry)*(pList->nEntry-iPos);
    memmove(&pSlot[1], pSlot, nMove);
    pSlot->iBlk = iBlk;
    pSlot->iId = iId;
    pList->nEntry++;
  }else{
    /* An entry for iBlk already exists. Overwrite its snapshot id. */
    pSlot->iId = iId;
  }

  return LSM_OK;
}
|
|
|
|
/*
|
|
** This function frees all resources held by the Database structure passed
|
|
** as the only argument.
|
|
*/
|
|
/*
** Release everything owned by Database object p: the client mutex, the
** shared file handle (if open), the apShmChunk[] array and finally the
** structure itself. A NULL p is a no-op. The caller must hold the global
** mutex.
*/
static void freeDatabase(lsm_env *pEnv, Database *p){
  assert( holdingGlobalMutex(pEnv) );
  if( p==0 ) return;

  lsmMutexDel(pEnv, p->pClientMutex);
  if( p->pFile ) lsmEnvClose(pEnv, p->pFile);

  /* Only the pointer array is freed here, not the chunks it refers to. */
  lsmFree(pEnv, p->apShmChunk);
  lsmFree(pEnv, p);
}
|
|
|
|
typedef struct DbTruncateCtx DbTruncateCtx;
|
|
struct DbTruncateCtx {
|
|
int nBlock;
|
|
i64 iInUse;
|
|
};
|
|
|
|
/*
** Free-list walker callback. If iBlk is the current last block
** (pCtx->nBlock) and is not excluded by the iInUse threshold, decrement
** nBlock and return 0 to continue the walk. Otherwise return 1 to stop.
*/
static int dbTruncateCb(void *pCtx, int iBlk, i64 iSnapshot){
  DbTruncateCtx *pTrunc = (DbTruncateCtx *)pCtx;
  int bIsLast = (iBlk==pTrunc->nBlock);
  int bInUse = (pTrunc->iInUse>=0 && iSnapshot>=pTrunc->iInUse);

  if( bIsLast && !bInUse ){
    pTrunc->nBlock--;
    return 0;
  }
  return 1;
}
|
|
|
|
/*
** The body of this function is currently compiled out (#if 0), so it
** always returns LSM_OK and ignores its arguments. The disabled code
** trims trailing free blocks from the worker snapshot's block count.
*/
static int dbTruncate(lsm_db *pDb, i64 iInUse){
  int rc = LSM_OK;
#if 0
  DbTruncateCtx ctx;
  int iBlk;

  assert( pDb->pWorker );
  ctx.nBlock = pDb->pWorker->nBlock;
  ctx.iInUse = iInUse;

  rc = lsmWalkFreelist(pDb, 1, dbTruncateCb, (void *)&ctx);
  iBlk = ctx.nBlock+1;
  while( rc==LSM_OK && iBlk<=pDb->pWorker->nBlock ){
    rc = freelistAppend(pDb, iBlk, -1);
    iBlk++;
  }

  if( rc==LSM_OK ){
#ifdef LSM_LOG_FREELIST
    if( ctx.nBlock!=pDb->pWorker->nBlock ){
      lsmLogMessage(pDb, 0,
          "dbTruncate(): truncated db to %d blocks",ctx.nBlock
      );
    }
#endif
    pDb->pWorker->nBlock = ctx.nBlock;
  }
#endif
  return rc;
}
|
|
|
|
|
|
/*
|
|
** This function is called during database shutdown (when the number of
|
|
** connections drops from one to zero). It truncates the database file
|
|
** to as small a size as possible without truncating away any blocks that
|
|
** contain data.
|
|
*/
|
|
/*
** Load the worker snapshot, count how many blocks remain once trailing
** free blocks are dropped, and truncate the database file to that many
** blocks if the count is smaller than the snapshot's block count. The
** worker snapshot is released again before returning.
**
** The caller must hold an exclusive DMS1 lock and must not already have a
** worker snapshot loaded. Returns LSM_OK or an LSM error code.
*/
static int dbTruncateFile(lsm_db *pDb){
  int rc;

  assert( pDb->pWorker==0 );
  assert( lsmShmAssertLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL) );
  rc = lsmCheckpointLoadWorker(pDb);

  if( rc==LSM_OK ){
    DbTruncateCtx ctx;

    /* Walk the free-list in reverse (largest block first). With iInUse
    ** set to -1 the callback strips every trailing free block, leaving
    ** ctx.nBlock at the number of the last block holding data. */
    ctx.iInUse = -1;
    ctx.nBlock = pDb->pWorker->nBlock;
    rc = lsmWalkFreelist(pDb, 1, dbTruncateCb, (void *)&ctx);

    /* Truncate only if the walk removed at least one block. */
    if( rc==LSM_OK && ctx.nBlock!=pDb->pWorker->nBlock ){
      i64 nByte = (i64)ctx.nBlock*lsmFsBlockSize(pDb->pFS);
      rc = lsmFsTruncateDb(pDb->pFS, nByte);
    }
  }

  lsmFreeSnapshot(pDb->pEnv, pDb->pWorker);
  pDb->pWorker = 0;
  return rc;
}
|
|
|
|
/*
** Disconnect handle pDb from the shared database system and clear
** pDb->pShmhdr.
**
** A read-only handle just drops its DMS3 lock. A read-write handle takes
** DMS1 exclusively, drops DMS2 and, if it turns out to be the last
** read-write connection, flushes the in-memory tree, writes a checkpoint
** and (when no read-only clients are present) deletes the log, truncates
** the database file and unmaps shared memory. Errors in these steps are
** not reported to the caller.
*/
static void doDbDisconnect(lsm_db *pDb){
  int rc;

  if( pDb->bReadonly ){
    lsmShmLock(pDb, LSM_LOCK_DMS3, LSM_LOCK_UNLOCK, 0);
    pDb->pShmhdr = 0;
    return;
  }

  /* Block for an exclusive lock on DMS1. This serializes all calls to
  ** doDbConnect() and doDbDisconnect() across all processes. */
  rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
  if( rc==LSM_OK ){
    int bRoConn = 0;              /* True if read-only connections exist */

    lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0);

    /* If exclusive locks on DMS2 and CHECKPOINTER are both available,
    ** this is the last connection. Each later step runs only while rc is
    ** still LSM_OK. */
    rc = lsmShmTestLock(pDb, LSM_LOCK_DMS2, 1, LSM_LOCK_EXCL);
    if( rc==LSM_OK ){
      rc = lsmShmTestLock(pDb, LSM_LOCK_CHECKPOINTER, 1, LSM_LOCK_EXCL);
    }

    /* Flush the in-memory tree if it holds anything. No WRITER lock is
    ** needed: with no other locks on DMS2 there are no other read-write
    ** connections, and holding DMS1 stops new ones from appearing. */
    if( rc==LSM_OK ){
      rc = lsmTreeLoadHeader(pDb, 0);
    }
    if( rc==LSM_OK && (lsmTreeHasOld(pDb) || lsmTreeSize(pDb)>0) ){
      rc = lsmFlushTreeToDisk(pDb);
    }

    /* A busy DMS3 means read-only connections exist. In that case the db
    ** file is not truncated and shared memory is not unlinked. */
    if( rc==LSM_OK ){
      rc = lsmShmTestLock(pDb, LSM_LOCK_DMS3, 1, LSM_LOCK_EXCL);
      if( rc==LSM_BUSY ){
        bRoConn = 1;
        rc = LSM_OK;
      }
    }

    /* Write a checkpoint to disk. */
    if( rc==LSM_OK ){
      rc = lsmCheckpointWrite(pDb, !bRoConn, 0);
    }

    /* After a successful checkpoint, delete the log file and, where
    ** permitted, truncate the database file. */
    if( rc==LSM_OK ){
      Database *pDatabase = pDb->pDatabase;
      int bRotrans = 0;

      /* The log may only be deleted if no read-only client is running a
      ** rotrans transaction. */
      rc = lsmDetectRoTrans(pDb, &bRotrans);
      if( rc==LSM_OK && bRotrans==0 ){
        lsmFsCloseAndDeleteLog(pDb->pFS);
      }

      /* The database may only be truncated if there are no read-only
      ** clients at all, connected or running rotrans transactions. */
      if( bRoConn==0 && bRotrans==0 ){
        dbTruncateFile(pDb);
        if( pDatabase->pFile && pDatabase->bMultiProc ){
          lsmEnvShmUnmap(pDb->pEnv, pDatabase->pFile, 1);
        }
      }
    }
  }

  if( pDb->iRwclient>=0 ){
    lsmShmLock(pDb, LSM_LOCK_RWCLIENT(pDb->iRwclient), LSM_LOCK_UNLOCK, 0);
    pDb->iRwclient = -1;
  }

  lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
  pDb->pShmhdr = 0;
}
|
|
|
|
/*
** Connect read-write handle pDb to the shared database system. This maps
** the shared-memory header, runs recovery if this is the first connection,
** takes a shared DMS2 lock and tries to claim one RWCLIENT slot.
**
** Returns LSM_OK on success or an LSM error code. If every RWCLIENT slot
** is busy, pDb->iRwclient is left unchanged and LSM_OK is still returned.
*/
static int doDbConnect(lsm_db *pDb){
  const int nUsMax = 100000;      /* Upper bound for nUs */
  int nUs = 1000;                 /* Microseconds to sleep between attempts */
  int rc;

  /* Obtain a pointer to the shared-memory header. */
  assert( pDb->pShmhdr==0 );
  assert( pDb->bReadonly==0 );
  rc = lsmShmCacheChunks(pDb, 1);
  if( rc!=LSM_OK ) return rc;
  pDb->pShmhdr = (ShmHeader *)pDb->apShm[0];

  /* Take an exclusive lock on DMS1, which serializes all calls to
  ** doDbConnect() and doDbDisconnect() across all processes. While the
  ** lock is busy, sleep and retry, doubling the sleep up to nUsMax. */
  do{
    rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
    if( rc==LSM_BUSY ){
      lsmEnvSleep(pDb->pEnv, nUs);
      nUs = nUs * 2;
      if( nUs>nUsMax ) nUs = nUsMax;
    }
  }while( rc==LSM_BUSY );
  if( rc!=LSM_OK ){
    pDb->pShmhdr = 0;
    return rc;
  }

  /* Test for an exclusive lock on DMS2 and DMS3 together. If available,
  ** this is the first and only connection: zero the shared-memory header
  ** and run checkpoint and log recovery. LSM_BUSY just means other
  ** connections exist and is not an error. */
  assert( LSM_LOCK_DMS3==1+LSM_LOCK_DMS2 );
  rc = lsmShmTestLock(pDb, LSM_LOCK_DMS2, 2, LSM_LOCK_EXCL);
  if( rc==LSM_BUSY ){
    rc = LSM_OK;
  }else if( rc==LSM_OK ){
    memset(pDb->pShmhdr, 0, sizeof(ShmHeader));
    rc = lsmCheckpointRecover(pDb);
    if( rc==LSM_OK ){
      rc = lsmLogRecover(pDb);
    }
    if( rc==LSM_OK ){
      ShmHeader *pHdr = pDb->pShmhdr;
      pHdr->aReader[0].iLsmId = lsmCheckpointId(pHdr->aSnap1, 0);
      pHdr->aReader[0].iTreeId = pDb->treehdr.iUsedShmid;
    }
  }

  /* Take a shared lock on DMS2. In multi-process mode this "cannot" fail,
  ** since an exclusive DMS2 lock is only ever held together with an
  ** exclusive DMS1 lock, and this connection holds DMS1. It does fail if
  ** another connection has the database open in single-process mode, in
  ** which case the connection attempt fails with that error. */
  if( rc==LSM_OK ){
    rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_SHARED, 0);
  }

  /* On success, try for an exclusive lock on one of the
  ** LSM_LOCK_RWCLIENT() slots, stopping at the first result that is not
  ** LSM_BUSY. On failure, forget the header pointer. DMS1 is released in
  ** both cases. */
  if( rc==LSM_OK ){
    int iSlot;
    for(iSlot=0; iSlot<LSM_LOCK_NRWCLIENT; iSlot++){
      int rcSlot = lsmShmLock(pDb, LSM_LOCK_RWCLIENT(iSlot), LSM_LOCK_EXCL, 0);
      if( rcSlot==LSM_BUSY ) continue;
      if( rcSlot==LSM_OK ) pDb->iRwclient = iSlot;
      rc = rcSlot;
      break;
    }
  }else{
    pDb->pShmhdr = 0;
  }
  lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);

  return rc;
}
|
|
|
|
/*
** Open the file p->zName and store the handle in p->pFile. If that fails
** with LSM_IOERR and bRoOk is true, retry with LSM_OPEN_READONLY and set
** p->bReadonly. Returns the result of the last open attempt.
*/
static int dbOpenSharedFd(lsm_env *pEnv, Database *p, int bRoOk){
  int rc = lsmEnvOpen(pEnv, p->zName, 0, &p->pFile);

  if( bRoOk && rc==LSM_IOERR ){
    rc = lsmEnvOpen(pEnv, p->zName, LSM_OPEN_READONLY, &p->pFile);
    p->bReadonly = 1;
  }
  return rc;
}
|
|
|
|
/*
|
|
** Return a reference to the shared Database handle for the database
|
|
** identified by canonical path zName. If this is the first connection to
|
|
** the named database, a new Database object is allocated. Otherwise, a
|
|
** pointer to an existing object is returned.
|
|
**
|
|
** If successful, *ppDatabase is set to point to the shared Database
|
|
** structure and LSM_OK returned. Otherwise, *ppDatabase is set to NULL
|
|
** and an LSM error code returned.
|
|
**
|
|
** Each successful call to this function should be (eventually) matched
|
|
** by a call to lsmDbDatabaseRelease().
|
|
*/
|
|
int lsmDbDatabaseConnect(
|
|
lsm_db *pDb, /* Database handle */
|
|
const char *zName /* Full-path to db file */
|
|
){
|
|
lsm_env *pEnv = pDb->pEnv;
|
|
int rc; /* Return code */
|
|
Database *p = 0; /* Pointer returned via *ppDatabase */
|
|
int nName = lsmStrlen(zName);
|
|
|
|
assert( pDb->pDatabase==0 );
|
|
rc = enterGlobalMutex(pEnv);
|
|
if( rc==LSM_OK ){
|
|
|
|
/* Search the global list for an existing object. TODO: Need something
|
|
** better than the memcmp() below to figure out if a given Database
|
|
** object represents the requested file. */
|
|
for(p=gShared.pDatabase; p; p=p->pDbNext){
|
|
if( nName==p->nName && 0==memcmp(zName, p->zName, nName) ) break;
|
|
}
|
|
|
|
/* If no suitable Database object was found, allocate a new one. */
|
|
if( p==0 ){
|
|
p = (Database *)lsmMallocZeroRc(pEnv, sizeof(Database)+nName+1, &rc);
|
|
|
|
/* If the allocation was successful, fill in other fields and
|
|
** allocate the client mutex. */
|
|
if( rc==LSM_OK ){
|
|
p->bMultiProc = pDb->bMultiProc;
|
|
p->zName = (char *)&p[1];
|
|
p->nName = nName;
|
|
memcpy((void *)p->zName, zName, nName+1);
|
|
rc = lsmMutexNew(pEnv, &p->pClientMutex);
|
|
}
|
|
|
|
/* If nothing has gone wrong so far, open the shared fd. And if that
|
|
** succeeds and this connection requested single-process mode,
|
|
** attempt to take the exclusive lock on DMS2. */
|
|
if( rc==LSM_OK ){
|
|
int bReadonly = (pDb->bReadonly && pDb->bMultiProc);
|
|
rc = dbOpenSharedFd(pDb->pEnv, p, bReadonly);
|
|
}
|
|
|
|
if( rc==LSM_OK && p->bMultiProc==0 ){
|
|
/* Hold an exclusive lock DMS1 while grabbing DMS2. This ensures
|
|
** that any ongoing call to doDbDisconnect() (even one in another
|
|
** process) is finished before proceeding. */
|
|
assert( p->bReadonly==0 );
|
|
rc = lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS1, LSM_LOCK_EXCL);
|
|
if( rc==LSM_OK ){
|
|
rc = lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS2, LSM_LOCK_EXCL);
|
|
lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK);
|
|
}
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
p->pDbNext = gShared.pDatabase;
|
|
gShared.pDatabase = p;
|
|
}else{
|
|
freeDatabase(pEnv, p);
|
|
p = 0;
|
|
}
|
|
}
|
|
|
|
if( p ){
|
|
p->nDbRef++;
|
|
}
|
|
leaveGlobalMutex(pEnv);
|
|
|
|
if( p ){
|
|
lsmMutexEnter(pDb->pEnv, p->pClientMutex);
|
|
pDb->pNext = p->pConn;
|
|
p->pConn = pDb;
|
|
lsmMutexLeave(pDb->pEnv, p->pClientMutex);
|
|
}
|
|
}
|
|
|
|
pDb->pDatabase = p;
|
|
if( rc==LSM_OK ){
|
|
assert( p );
|
|
rc = lsmFsOpen(pDb, zName, p->bReadonly);
|
|
}
|
|
|
|
/* If the db handle is read-write, then connect to the system now. Run
|
|
** recovery as necessary. Or, if this is a read-only database handle,
|
|
** defer attempting to connect to the system until a read-transaction
|
|
** is opened. */
|
|
if( pDb->bReadonly==0 ){
|
|
if( rc==LSM_OK ){
|
|
rc = lsmFsConfigure(pDb);
|
|
}
|
|
if( rc==LSM_OK ){
|
|
rc = doDbConnect(pDb);
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
** If pDb has a file-system object, take the LsmFile returned by
** lsmFsDeferClose() and push it onto the front of the Database's list of
** deferred closes.
*/
static void dbDeferClose(lsm_db *pDb){
  Database *pDatabase;
  LsmFile *pDeferred;

  if( pDb->pFS==0 ) return;

  pDatabase = pDb->pDatabase;
  pDeferred = lsmFsDeferClose(pDb->pFS);
  pDeferred->pNext = pDatabase->pLsmFile;
  pDatabase->pLsmFile = pDeferred;
}
|
|
|
|
/*
** Remove and return the first entry of the Database's deferred-close
** list, or return NULL if the list is empty. The list is accessed under
** the client mutex.
*/
LsmFile *lsmDbRecycleFd(lsm_db *db){
  Database *pDatabase = db->pDatabase;
  LsmFile *pHead;

  lsmMutexEnter(db->pEnv, pDatabase->pClientMutex);
  pHead = pDatabase->pLsmFile;
  if( pHead!=0 ){
    pDatabase->pLsmFile = pHead->pNext;
  }
  lsmMutexLeave(db->pEnv, pDatabase->pClientMutex);

  return pHead;
}
|
|
|
|
/*
|
|
** Release a reference to a Database object obtained from
|
|
** lsmDbDatabaseConnect(). There should be exactly one call to this function
|
|
** for each successful call to lsmDbDatabaseConnect().
|
|
*/
|
|
void lsmDbDatabaseRelease(lsm_db *pDb){
  Database *p = pDb->pDatabase;
  lsm_db **ppConn;

  if( p==0 ) return;

  /* A non-NULL pShmhdr means the handle is connected to the system. */
  if( pDb->pShmhdr ){
    doDbDisconnect(pDb);
  }

  /* Unlink pDb from the connection list and hand its file over for a
  ** deferred close, all under the client mutex. */
  lsmMutexEnter(pDb->pEnv, p->pClientMutex);
  ppConn = &p->pConn;
  while( *ppConn!=pDb ){
    ppConn = &(*ppConn)->pNext;
  }
  *ppConn = pDb->pNext;
  dbDeferClose(pDb);
  lsmMutexLeave(pDb->pEnv, p->pClientMutex);

  /* Drop the reference. The last one out destroys the Database object. */
  enterGlobalMutex(pDb->pEnv);
  p->nDbRef--;
  if( p->nDbRef==0 ){
    Database **ppDatabase;
    LsmFile *pFile;

    /* Remove the Database structure from the global linked list. */
    ppDatabase = &gShared.pDatabase;
    while( *ppDatabase!=p ){
      ppDatabase = &(*ppDatabase)->pDbNext;
    }
    *ppDatabase = p->pDbNext;

    /* Outside multi-process mode the shared-memory chunks came from the
    ** heap, so free each of them. */
    if( p->bMultiProc==0 ){
      int iChunk;
      for(iChunk=0; iChunk<p->nShmChunk; iChunk++){
        lsmFree(pDb->pEnv, p->apShmChunk[iChunk]);
      }
    }

    /* Close every file descriptor still on the deferred-close list. */
    pFile = p->pLsmFile;
    while( pFile ){
      LsmFile *pFollowing = pFile->pNext;
      lsmEnvClose(pDb->pEnv, pFile->pFile);
      lsmFree(pDb->pEnv, pFile);
      pFile = pFollowing;
    }
    freeDatabase(pDb->pEnv, p);
  }
  leaveGlobalMutex(pDb->pEnv);
}
|
|
|
|
/*
** Accessor: return the pLevel pointer stored in snapshot pSnapshot.
*/
Level *lsmDbSnapshotLevel(Snapshot *pSnapshot){
  Level *pTop = pSnapshot->pLevel;
  return pTop;
}
|
|
|
|
/*
** Mutator: store pLevel as the pLevel pointer of snapshot pSnap.
*/
void lsmDbSnapshotSetLevel(Snapshot *pSnap, Level *pLevel){
  Snapshot *pTarget = pSnap;
  pTarget->pLevel = pLevel;
}
|
|
|
|
/* TODO: Shuffle things around to get rid of this */
|
|
static int firstSnapshotInUse(lsm_db *, i64 *);
|
|
|
|
/*
|
|
** Context object used by the lsmWalkFreelist() utility.
|
|
*/
|
|
typedef struct WalkFreelistCtx WalkFreelistCtx;
|
|
struct WalkFreelistCtx {
|
|
lsm_db *pDb;
|
|
int bReverse;
|
|
Freelist *pFreelist;
|
|
int iFree;
|
|
int (*xUsr)(void *, int, i64); /* User callback function */
|
|
void *pUsrctx; /* User callback context */
|
|
int bDone; /* Set to true after xUsr() returns true */
|
|
};
|
|
|
|
/*
|
|
** Callback used by lsmWalkFreelist().
|
|
*/
|
|
/*
** Merging callback used by lsmWalkFreelist(). Before forwarding entry
** (iBlk, iSnapshot) to p->xUsr, it first forwards any entries of the
** in-memory list p->pFreelist that sort at or before iBlk in the walk
** direction. In-memory entries with a negative iId are skipped rather
** than forwarded, and an in-memory entry for iBlk itself replaces the
** incoming one.
**
** Returns 1 (and sets p->bDone) as soon as xUsr returns non-zero,
** otherwise 0.
*/
static int walkFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){
  WalkFreelistCtx *pWalk = (WalkFreelistCtx *)pCtx;
  Freelist *pList = pWalk->pFreelist;
  const int iStep = (pWalk->bReverse ? -1 : 1);

  assert( pWalk->bDone==0 );
  assert( iBlk>=0 );

  if( pList ){
    while( (pWalk->iFree < pList->nEntry) && pWalk->iFree>=0 ){
      FreelistEntry *pEntry = &pList->aEntry[pWalk->iFree];
      int bBeyond;                /* True if pEntry sorts after iBlk */

      if( pWalk->bReverse==0 ){
        bBeyond = (pEntry->iBlk>(u32)iBlk);
      }else{
        bBeyond = (pEntry->iBlk<(u32)iBlk);
      }
      if( bBeyond ) break;

      /* Consume the in-memory entry, forwarding it unless it is a delete
      ** marker (negative iId). */
      pWalk->iFree += iStep;
      if( pEntry->iId>=0
       && pWalk->xUsr(pWalk->pUsrctx, pEntry->iBlk, pEntry->iId)
      ){
        pWalk->bDone = 1;
        return 1;
      }

      /* The in-memory entry overrides the incoming one for this block. */
      if( pEntry->iBlk==(u32)iBlk ) return 0;
    }
  }

  if( pWalk->xUsr(pWalk->pUsrctx, iBlk, iSnapshot) ){
    pWalk->bDone = 1;
    return 1;
  }
  return 0;
}
|
|
|
|
/*
|
|
** The database handle passed as the first argument must be the worker
|
|
** connection. This function iterates through the contents of the current
|
|
** free block list, invoking the supplied callback once for each list
|
|
** element.
|
|
**
|
|
** The difference between this function and lsmSortedWalkFreelist() is
|
|
** that lsmSortedWalkFreelist() only considers those free-list elements
|
|
** stored within the LSM. This function also merges in any in-memory
|
|
** elements.
|
|
*/
|
|
int lsmWalkFreelist(
|
|
lsm_db *pDb, /* Database handle (must be worker) */
|
|
int bReverse, /* True to iterate from largest to smallest */
|
|
int (*x)(void *, int, i64), /* Callback function */
|
|
void *pCtx /* First argument to pass to callback */
|
|
){
|
|
const int iDir = (bReverse ? -1 : 1);
|
|
int rc;
|
|
int iCtx;
|
|
|
|
WalkFreelistCtx ctx[2];
|
|
|
|
ctx[0].pDb = pDb;
|
|
ctx[0].bReverse = bReverse;
|
|
ctx[0].pFreelist = &pDb->pWorker->freelist;
|
|
if( ctx[0].pFreelist && bReverse ){
|
|
ctx[0].iFree = ctx[0].pFreelist->nEntry-1;
|
|
}else{
|
|
ctx[0].iFree = 0;
|
|
}
|
|
ctx[0].xUsr = walkFreelistCb;
|
|
ctx[0].pUsrctx = (void *)&ctx[1];
|
|
ctx[0].bDone = 0;
|
|
|
|
ctx[1].pDb = pDb;
|
|
ctx[1].bReverse = bReverse;
|
|
ctx[1].pFreelist = pDb->pFreelist;
|
|
if( ctx[1].pFreelist && bReverse ){
|
|
ctx[1].iFree = ctx[1].pFreelist->nEntry-1;
|
|
}else{
|
|
ctx[1].iFree = 0;
|
|
}
|
|
ctx[1].xUsr = x;
|
|
ctx[1].pUsrctx = pCtx;
|
|
ctx[1].bDone = 0;
|
|
|
|
rc = lsmSortedWalkFreelist(pDb, bReverse, walkFreelistCb, (void *)&ctx[0]);
|
|
|
|
if( ctx[0].bDone==0 ){
|
|
for(iCtx=0; iCtx<2; iCtx++){
|
|
int i;
|
|
WalkFreelistCtx *p = &ctx[iCtx];
|
|
for(i=p->iFree;
|
|
p->pFreelist && rc==LSM_OK && i<p->pFreelist->nEntry && i>=0;
|
|
i += iDir
|
|
){
|
|
FreelistEntry *pEntry = &p->pFreelist->aEntry[i];
|
|
if( pEntry->iId>=0 && p->xUsr(p->pUsrctx, pEntry->iBlk, pEntry->iId) ){
|
|
return LSM_OK;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
|
|
typedef struct FindFreeblockCtx FindFreeblockCtx;
|
|
struct FindFreeblockCtx {
|
|
i64 iInUse;
|
|
int iRet;
|
|
int bNotOne;
|
|
};
|
|
|
|
/*
** Free-list walker callback. Accept the first block whose snapshot id is
** below p->iInUse (and which is not block 1 when p->bNotOne is set):
** record it in p->iRet and return 1 to stop the walk. Otherwise return 0.
*/
static int findFreeblockCb(void *pCtx, int iBlk, i64 iSnapshot){
  FindFreeblockCtx *pFind = (FindFreeblockCtx *)pCtx;

  if( iSnapshot>=pFind->iInUse ) return 0;
  if( iBlk==1 && pFind->bNotOne!=0 ) return 0;

  pFind->iRet = iBlk;
  return 1;
}
|
|
|
|
/*
** Walk the free-list in ascending order looking for a block accepted by
** findFreeblockCb(). *piRet is set to the block number found, or to 0 if
** none was. Returns the result of lsmWalkFreelist().
*/
static int findFreeblock(lsm_db *pDb, i64 iInUse, int bNotOne, int *piRet){
  FindFreeblockCtx sFind;         /* Context object */
  int rc;                         /* Return code */

  sFind.bNotOne = bNotOne;
  sFind.iRet = 0;
  sFind.iInUse = iInUse;

  rc = lsmWalkFreelist(pDb, 0, findFreeblockCb, (void *)&sFind);
  *piRet = sFind.iRet;
  return rc;
}
|
|
|
|
/*
|
|
** Allocate a new database file block to write data to, either by extending
|
|
** the database file or by recycling a free-list entry. The worker snapshot
|
|
** must be held in order to call this function.
|
|
**
|
|
** If successful, *piBlk is set to the block number allocated and LSM_OK is
|
|
** returned. Otherwise, *piBlk is zeroed and an lsm error code returned.
|
|
*/
|
|
int lsmBlockAllocate(lsm_db *pDb, int iBefore, int *piBlk){
  Snapshot *pWorker = pDb->pWorker;
  i64 iSynced = 0;                /* Snapshot id synced to disk */
  i64 iInUse = 0;                 /* Snapshot id still in use */
  int iRet = 0;                   /* Block number of allocated block */
  int rc = LSM_OK;

  assert( pWorker );

#ifdef LSM_LOG_FREELIST
  {
    static int nCall = 0;
    char *zFree = 0;
    nCall++;
    rc = lsmInfoFreelist(pDb, &zFree);
    if( rc!=LSM_OK ) return rc;
    lsmLogMessage(pDb, 0, "lsmBlockAllocate(): %d freelist: %s", nCall, zFree);
    lsmFree(pDb->pEnv, zFree);
  }
#endif

  /* Set iInUse to the smallest snapshot id that is either:
  **
  **   * Currently in use by a database client,
  **   * May be used by a database client in the future, or
  **   * Is the most recently checkpointed snapshot (i.e. the one that will
  **     be used following recovery if a failure occurs at this point).
  */
  rc = lsmCheckpointSynced(pDb, &iSynced, 0, 0);
  if( rc==LSM_OK && iSynced==0 ){
    iSynced = pWorker->iId;
  }
  iInUse = iSynced;
  if( rc==LSM_OK && pDb->iReader>=0 ){
    assert( pDb->pClient );
    iInUse = LSM_MIN(iInUse, pDb->pClient->iId);
  }
  if( rc==LSM_OK ){
    rc = firstSnapshotInUse(pDb, &iInUse);
  }

#ifdef LSM_LOG_FREELIST
  {
    lsmLogMessage(pDb, 0, "lsmBlockAllocate(): "
        "snapshot-in-use: %lld (iSynced=%lld) (client-id=%lld)",
        iInUse, iSynced, (pDb->iReader>=0 ? pDb->pClient->iId : 0)
    );
  }
#endif

  /* Unless a read-only transaction exists (which prevents any block from
  ** being recycled), query the free block list for a block to reuse.
  **
  ** It might seem more natural to check for a read-only transaction at
  ** the start of this function. However, it is better to wait until after
  ** the call to lsmCheckpointSynced() to do so.
  */
  if( rc==LSM_OK ){
    int bRotrans;
    rc = lsmDetectRoTrans(pDb, &bRotrans);
    if( rc==LSM_OK && bRotrans==0 ){
      rc = findFreeblock(pDb, iInUse, (iBefore>0), &iRet);
    }
  }

  if( iBefore>0 && (iRet<=0 || iRet>=iBefore) ){
    /* The caller wants a block numbered below iBefore and none was
    ** found. Report "no block" by returning 0 in *piBlk. */
    iRet = 0;
  }else if( rc==LSM_OK && iRet>0 ){
    /* A free block was found. Take it and record its removal from the
    ** free-list. */
#ifdef LSM_LOG_FREELIST
    lsmLogMessage(pDb, 0,
        "reusing block %d (snapshot-in-use=%lld)", iRet, iInUse);
#endif
    rc = freelistAppend(pDb, iRet, -1);
    if( rc==LSM_OK ){
      rc = dbTruncate(pDb, iInUse);
    }
  }else if( rc==LSM_OK ){
    /* Nothing suitable on the free-list. Extend the file by one block. */
    pWorker->nBlock++;
    iRet = pWorker->nBlock;
#ifdef LSM_LOG_FREELIST
    lsmLogMessage(pDb, 0, "extending file to %d blocks", iRet);
#endif
  }

  assert( iBefore>0 || iRet>0 || rc!=LSM_OK );
  *piBlk = iRet;
  return rc;
}
|
|
|
|
/*
|
|
** Free a database block. The worker snapshot must be held in order to call
|
|
** this function.
|
|
**
|
|
** If successful, LSM_OK is returned. Otherwise, an lsm error code (e.g.
|
|
** LSM_NOMEM) is returned.
|
|
*/
|
|
int lsmBlockFree(lsm_db *pDb, int iBlk){
  Snapshot *pWorker = pDb->pWorker;
  int rc;

  assert( lsmShmAssertWorker(pDb) );

#ifdef LSM_LOG_FREELIST
  lsmLogMessage(pDb, LSM_OK, "lsmBlockFree(): Free block %d", iBlk);
#endif

  /* The free-list entry is tagged with the worker snapshot's id. */
  rc = freelistAppend(pDb, iBlk, pWorker->iId);
  return rc;
}
|
|
|
|
/*
|
|
** Refree a database block. The worker snapshot must be held in order to call
|
|
** this function.
|
|
**
|
|
** Refreeing is required when a block is allocated using lsmBlockAllocate()
|
|
** but then not used. This function is used to push the block back onto
|
|
** the freelist. Refreeing a block is different from freeing it, as a refreed
|
|
** block may be reused immediately. Whereas a freed block can not be reused
|
|
** until (at least) after the next checkpoint.
|
|
*/
|
|
int lsmBlockRefree(lsm_db *pDb, int iBlk){
#ifdef LSM_LOG_FREELIST
  lsmLogMessage(pDb, LSM_OK, "lsmBlockRefree(): Refree block %d", iBlk);
#endif

  /* The block goes back on the free-list with snapshot id 0. */
  return freelistAppend(pDb, iBlk, 0);
}
|
|
|
|
/*
|
|
** If required, copy a database checkpoint from shared memory into the
|
|
** database itself.
|
|
**
|
|
** The WORKER lock must not be held when this is called. This is because
|
|
** this function may indirectly call fsync(). And the WORKER lock should
|
|
** not be held that long (in case it is required by a client flushing an
|
|
** in-memory tree to disk).
|
|
*/
|
|
int lsmCheckpointWrite(lsm_db *pDb, int bTruncate, u32 *pnWrite){
  u32 nWrite = 0;
  int rc;                         /* Return Code */

  assert( pDb->pWorker==0 );
  assert( 1 || pDb->pClient==0 );
  assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK) );

  /* The CHECKPOINTER lock is held for the rest of the function. */
  rc = lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_EXCL, 0);
  if( rc!=LSM_OK ) return rc;

  rc = lsmCheckpointLoad(pDb, 0);
  if( rc==LSM_OK ){
    ShmHeader *pHdr = pDb->pShmhdr;
    int nBlock = lsmCheckpointNBlock(pDb->aSnapshot);
    int bStored = 0;              /* True if checkpoint is already on disk */

    /* If a meta page is in use, read the checkpoint it holds. The loaded
    ** checkpoint counts as already stored if the one on disk has an id at
    ** least as large. */
    if( pHdr->iMetaPage ){
      MetaPage *pPg;              /* Meta page */
      u8 *aData;                  /* Meta-page data buffer */
      int nData;                  /* Size of aData[] in bytes */
      i64 iDisk = 0;              /* Id of checkpoint stored in the db */
      i64 iCkpt = lsmCheckpointId(pDb->aSnapshot, 0);

      rc = lsmFsMetaPageGet(pDb->pFS, 0, pHdr->iMetaPage, &pPg);
      if( rc==LSM_OK ){
        aData = lsmFsMetaPageData(pPg, &nData);
        iDisk = lsmCheckpointId((u32 *)aData, 1);
        nWrite = lsmCheckpointNWrite((u32 *)aData, 1);
        lsmFsMetaPageRelease(pPg);
      }
      bStored = (iDisk>=iCkpt);
    }

    if( rc==LSM_OK && bStored==0 ){
      /* Store into the other meta page: page 2 if page 1 is current,
      ** otherwise page 1. Unless safety is off, sync the database both
      ** before and after the store. */
      int iMeta = (pHdr->iMetaPage % 2) + 1;
      int bSync = (pDb->eSafety!=LSM_SAFETY_OFF);

      if( bSync ){
        rc = lsmFsSyncDb(pDb->pFS, nBlock);
      }
      if( rc==LSM_OK ){
        rc = lsmCheckpointStore(pDb, iMeta);
      }
      if( rc==LSM_OK && bSync ){
        rc = lsmFsSyncDb(pDb->pFS, 0);
      }
      if( rc==LSM_OK ){
        pHdr->iMetaPage = iMeta;
        nWrite = lsmCheckpointNWrite(pDb->aSnapshot, 0) - nWrite;
      }
#ifdef LSM_LOG_WORK
      lsmLogMessage(pDb, 0, "finish checkpoint %d",
          (int)lsmCheckpointId(pDb->aSnapshot, 0)
      );
#endif
    }

    if( rc==LSM_OK && bTruncate && nBlock>0 ){
      i64 nByte = (i64)nBlock*lsmFsBlockSize(pDb->pFS);
      rc = lsmFsTruncateDb(pDb->pFS, nByte);
    }
  }

  lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_UNLOCK, 0);
  if( rc==LSM_OK && pnWrite ){
    *pnWrite = nWrite;
  }
  return rc;
}
|
|
|
|
/*
** Try to take the exclusive WORKER lock (without blocking) and, if that
** succeeds, deserialize the current worker snapshot. Returns LSM_OK or
** the error code from whichever step failed.
*/
int lsmBeginWork(lsm_db *pDb){
  int rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0);
  if( rc!=LSM_OK ) return rc;

  return lsmCheckpointLoadWorker(pDb);
}
|
|
|
|
/*
** Free snapshot p along with its level list, free-list entries and
** redirect array. Passing NULL is a harmless no-op.
*/
void lsmFreeSnapshot(lsm_env *pEnv, Snapshot *p){
  if( p==0 ) return;

  lsmSortedFreeLevel(pEnv, p->pLevel);
  lsmFree(pEnv, p->freelist.aEntry);
  lsmFree(pEnv, p->redirect.a);
  lsmFree(pEnv, p);
}
|
|
|
|
/*
|
|
** Attempt to populate one of the read-lock slots to contain lock values
|
|
** iLsm/iShm. Or, if such a slot exists already, this function is a no-op.
|
|
**
|
|
** It is not an error if no slot can be populated because the write-lock
|
|
** cannot be obtained. If any other error occurs, return an LSM error code.
|
|
** Otherwise, LSM_OK.
|
|
**
|
|
** This function is called at various points to try to ensure that there
|
|
** always exists at least one read-lock slot that can be used by a read-only
|
|
** client. And so that, in the usual case, there is an "exact match" available
|
|
** whenever a read transaction is opened by any client. At present this
|
|
** function is called when:
|
|
**
|
|
** * A write transaction that called lsmTreeDiscardOld() is committed, and
|
|
** * Whenever the working snapshot is updated (i.e. lsmFinishWork()).
|
|
*/
|
|
static int dbSetReadLock(lsm_db *db, i64 iLsm, u32 iShm){
  int rc = LSM_OK;
  ShmHeader *pShm = db->pShmhdr;
  int i;

  /* Check if there is already a slot containing the required values. */
  for(i=0; i<LSM_LOCK_NREADER; i++){
    ShmReader *p = &pShm->aReader[i];
    if( p->iLsmId==iLsm && p->iTreeId==iShm ) return LSM_OK;
  }

  /* Iterate through all read-lock slots, attempting to take a write-lock
  ** on each of them. If a write-lock succeeds, populate the locked slot
  ** with the required values and break out of the loop.
  **
  ** A slot is only modified while its write-lock is actually held. If
  ** lsmShmLock() fails with anything other than LSM_BUSY, the slot is left
  ** untouched, no unlock is attempted for a lock that was never obtained,
  ** and the loop condition ends the scan so the error reaches the caller. */
  for(i=0; rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
    rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
    if( rc==LSM_BUSY ){
      /* Slot is in use by someone else. Not an error - try the next one. */
      rc = LSM_OK;
    }else if( rc==LSM_OK ){
      ShmReader *p = &pShm->aReader[i];
      p->iLsmId = iLsm;
      p->iTreeId = iShm;
      lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
      break;
    }
  }

  return rc;
}
|
|
|
|
/*
|
|
** Release the read-lock currently held by connection db.
|
|
*/
|
|
int dbReleaseReadlock(lsm_db *db){
|
|
int rc = LSM_OK;
|
|
if( db->iReader>=0 ){
|
|
rc = lsmShmLock(db, LSM_LOCK_READER(db->iReader), LSM_LOCK_UNLOCK, 0);
|
|
db->iReader = -1;
|
|
}
|
|
db->bRoTrans = 0;
|
|
return rc;
|
|
}
|
|
|
|
|
|
/*
|
|
** Argument bFlush is true if the contents of the in-memory tree has just
|
|
** been flushed to disk. The significance of this is that once the snapshot
|
|
** created to hold the updated state of the database is synced to disk, log
|
|
** file space can be recycled.
|
|
*/
|
|
void lsmFinishWork(lsm_db *pDb, int bFlush, int *pRc){
|
|
int rc = *pRc;
|
|
assert( rc!=0 || pDb->pWorker );
|
|
if( pDb->pWorker ){
|
|
/* If no error has occurred, serialize the worker snapshot and write
|
|
** it to shared memory. */
|
|
if( rc==LSM_OK ){
|
|
rc = lsmSaveWorker(pDb, bFlush);
|
|
}
|
|
|
|
/* Assuming no error has occurred, update a read lock slot with the
|
|
** new snapshot id (see comments above function dbSetReadLock()). */
|
|
if( rc==LSM_OK ){
|
|
if( pDb->iReader<0 ){
|
|
rc = lsmTreeLoadHeader(pDb, 0);
|
|
}
|
|
if( rc==LSM_OK ){
|
|
rc = dbSetReadLock(pDb, pDb->pWorker->iId, pDb->treehdr.iUsedShmid);
|
|
}
|
|
}
|
|
|
|
/* Free the snapshot object. */
|
|
lsmFreeSnapshot(pDb->pEnv, pDb->pWorker);
|
|
pDb->pWorker = 0;
|
|
}
|
|
|
|
lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0);
|
|
*pRc = rc;
|
|
}
|
|
|
|
/*
|
|
** Called when recovery is finished.
|
|
*/
|
|
int lsmFinishRecovery(lsm_db *pDb){
|
|
lsmTreeEndTransaction(pDb, 1);
|
|
return LSM_OK;
|
|
}
|
|
|
|
/*
|
|
** Check if the currently configured compression functions
|
|
** (LSM_CONFIG_SET_COMPRESSION) are compatible with a database that has its
|
|
** compression id set to iReq. Compression routines are compatible if iReq
|
|
** is zero (indicating the database is empty), or if it is equal to the
|
|
** compression id of the configured compression routines.
|
|
**
|
|
** If the check shows that the current compression are incompatible and there
|
|
** is a compression factory registered, give it a chance to install new
|
|
** compression routines.
|
|
**
|
|
** If, after any registered factory is invoked, the compression functions
|
|
** are still incompatible, return LSM_MISMATCH. Otherwise, LSM_OK.
|
|
*/
|
|
int lsmCheckCompressionId(lsm_db *pDb, u32 iReq){
|
|
if( iReq!=LSM_COMPRESSION_EMPTY && pDb->compress.iId!=iReq ){
|
|
if( pDb->factory.xFactory ){
|
|
pDb->bInFactory = 1;
|
|
pDb->factory.xFactory(pDb->factory.pCtx, pDb, iReq);
|
|
pDb->bInFactory = 0;
|
|
}
|
|
if( pDb->compress.iId!=iReq ){
|
|
/* Incompatible */
|
|
return LSM_MISMATCH;
|
|
}
|
|
}
|
|
/* Compatible */
|
|
return LSM_OK;
|
|
}
|
|
|
|
/*
|
|
** Begin a read transaction. This function is a no-op if the connection
|
|
** passed as the only argument already has an open read transaction.
|
|
*/
|
|
int lsmBeginReadTrans(lsm_db *pDb){
|
|
const int MAX_READLOCK_ATTEMPTS = 10;
|
|
const int nMaxAttempt = (pDb->bRoTrans ? 1 : MAX_READLOCK_ATTEMPTS);
|
|
|
|
int rc = LSM_OK; /* Return code */
|
|
int iAttempt = 0;
|
|
|
|
assert( pDb->pWorker==0 );
|
|
|
|
while( rc==LSM_OK && pDb->iReader<0 && (iAttempt++)<nMaxAttempt ){
|
|
int iTreehdr = 0;
|
|
int iSnap = 0;
|
|
assert( pDb->pCsr==0 && pDb->nTransOpen==0 );
|
|
|
|
/* Load the in-memory tree header. */
|
|
rc = lsmTreeLoadHeader(pDb, &iTreehdr);
|
|
|
|
/* Load the database snapshot */
|
|
if( rc==LSM_OK ){
|
|
if( lsmCheckpointClientCacheOk(pDb)==0 ){
|
|
lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
|
|
pDb->pClient = 0;
|
|
lsmMCursorFreeCache(pDb);
|
|
lsmFsPurgeCache(pDb->pFS);
|
|
rc = lsmCheckpointLoad(pDb, &iSnap);
|
|
}else{
|
|
iSnap = 1;
|
|
}
|
|
}
|
|
|
|
/* Take a read-lock on the tree and snapshot just loaded. Then check
|
|
** that the shared-memory still contains the same values. If so, proceed.
|
|
** Otherwise, relinquish the read-lock and retry the whole procedure
|
|
** (starting with loading the in-memory tree header). */
|
|
if( rc==LSM_OK ){
|
|
u32 iShmMax = pDb->treehdr.iUsedShmid;
|
|
u32 iShmMin = pDb->treehdr.iNextShmid+1-LSM_MAX_SHMCHUNKS;
|
|
rc = lsmReadlock(
|
|
pDb, lsmCheckpointId(pDb->aSnapshot, 0), iShmMin, iShmMax
|
|
);
|
|
if( rc==LSM_OK ){
|
|
if( lsmTreeLoadHeaderOk(pDb, iTreehdr)
|
|
&& lsmCheckpointLoadOk(pDb, iSnap)
|
|
){
|
|
/* Read lock has been successfully obtained. Deserialize the
|
|
** checkpoint just loaded. TODO: This will be removed after
|
|
** lsm_sorted.c is changed to work directly from the serialized
|
|
** version of the snapshot. */
|
|
if( pDb->pClient==0 ){
|
|
rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot,&pDb->pClient);
|
|
}
|
|
assert( (rc==LSM_OK)==(pDb->pClient!=0) );
|
|
assert( pDb->iReader>=0 );
|
|
|
|
/* Check that the client has the right compression hooks loaded.
|
|
** If not, set rc to LSM_MISMATCH. */
|
|
if( rc==LSM_OK ){
|
|
rc = lsmCheckCompressionId(pDb, pDb->pClient->iCmpId);
|
|
}
|
|
}else{
|
|
rc = dbReleaseReadlock(pDb);
|
|
}
|
|
}
|
|
|
|
if( rc==LSM_BUSY ){
|
|
rc = LSM_OK;
|
|
}
|
|
}
|
|
#if 0
|
|
if( rc==LSM_OK && pDb->pClient ){
|
|
fprintf(stderr,
|
|
"reading %p: snapshot:%d used-shmid:%d trans-id:%d iOldShmid=%d\n",
|
|
(void *)pDb,
|
|
(int)pDb->pClient->iId, (int)pDb->treehdr.iUsedShmid,
|
|
(int)pDb->treehdr.root.iTransId,
|
|
(int)pDb->treehdr.iOldShmid
|
|
);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
rc = lsmShmCacheChunks(pDb, pDb->treehdr.nChunk);
|
|
}
|
|
if( rc!=LSM_OK ){
|
|
dbReleaseReadlock(pDb);
|
|
}
|
|
if( pDb->pClient==0 && rc==LSM_OK ) rc = LSM_BUSY;
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** This function is used by a read-write connection to determine if there
|
|
** are currently one or more read-only transactions open on the database
|
|
** (in this context a read-only transaction is one opened by a read-only
|
|
** connection on a non-live database).
|
|
**
|
|
** If no error occurs, LSM_OK is returned and *pbExists is set to true if
|
|
** some other connection has a read-only transaction open, or false
|
|
** otherwise. If an error occurs an LSM error code is returned and the final
|
|
** value of *pbExist is undefined.
|
|
*/
|
|
int lsmDetectRoTrans(lsm_db *db, int *pbExist){
|
|
int rc;
|
|
|
|
/* Only a read-write connection may use this function. */
|
|
assert( db->bReadonly==0 );
|
|
|
|
rc = lsmShmTestLock(db, LSM_LOCK_ROTRANS, 1, LSM_LOCK_EXCL);
|
|
if( rc==LSM_BUSY ){
|
|
*pbExist = 1;
|
|
rc = LSM_OK;
|
|
}else{
|
|
*pbExist = 0;
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** db is a read-only database handle in the disconnected state. This function
|
|
** attempts to open a read-transaction on the database. This may involve
|
|
** connecting to the database system (opening shared memory etc.).
|
|
*/
|
|
int lsmBeginRoTrans(lsm_db *db){
|
|
int rc = LSM_OK;
|
|
|
|
assert( db->bReadonly && db->pShmhdr==0 );
|
|
assert( db->iReader<0 );
|
|
|
|
if( db->bRoTrans==0 ){
|
|
|
|
/* Attempt a shared-lock on DMS1. */
|
|
rc = lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_SHARED, 0);
|
|
if( rc!=LSM_OK ) return rc;
|
|
|
|
rc = lsmShmTestLock(
|
|
db, LSM_LOCK_RWCLIENT(0), LSM_LOCK_NREADER, LSM_LOCK_SHARED
|
|
);
|
|
if( rc==LSM_OK ){
|
|
/* System is not live. Take a SHARED lock on the ROTRANS byte and
|
|
** release DMS1. Locking ROTRANS tells all read-write clients that they
|
|
** may not recycle any disk space from within the database or log files,
|
|
** as a read-only client may be using it. */
|
|
rc = lsmShmLock(db, LSM_LOCK_ROTRANS, LSM_LOCK_SHARED, 0);
|
|
lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
|
|
|
|
if( rc==LSM_OK ){
|
|
db->bRoTrans = 1;
|
|
rc = lsmShmCacheChunks(db, 1);
|
|
if( rc==LSM_OK ){
|
|
db->pShmhdr = (ShmHeader *)db->apShm[0];
|
|
memset(db->pShmhdr, 0, sizeof(ShmHeader));
|
|
rc = lsmCheckpointRecover(db);
|
|
if( rc==LSM_OK ){
|
|
rc = lsmLogRecover(db);
|
|
}
|
|
}
|
|
}
|
|
}else if( rc==LSM_BUSY ){
|
|
/* System is live! */
|
|
rc = lsmShmLock(db, LSM_LOCK_DMS3, LSM_LOCK_SHARED, 0);
|
|
lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
|
|
if( rc==LSM_OK ){
|
|
rc = lsmShmCacheChunks(db, 1);
|
|
if( rc==LSM_OK ){
|
|
db->pShmhdr = (ShmHeader *)db->apShm[0];
|
|
}
|
|
}
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
rc = lsmBeginReadTrans(db);
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Close the currently open read transaction.
|
|
*/
|
|
void lsmFinishReadTrans(lsm_db *pDb){
|
|
|
|
/* Worker connections should not be closing read transactions. And
|
|
** read transactions should only be closed after all cursors and write
|
|
** transactions have been closed. Finally pClient should be non-NULL
|
|
** only iff pDb->iReader>=0. */
|
|
assert( pDb->pWorker==0 );
|
|
assert( pDb->pCsr==0 && pDb->nTransOpen==0 );
|
|
|
|
if( pDb->bRoTrans ){
|
|
int i;
|
|
for(i=0; i<pDb->nShm; i++){
|
|
lsmFree(pDb->pEnv, pDb->apShm[i]);
|
|
}
|
|
lsmFree(pDb->pEnv, pDb->apShm);
|
|
pDb->apShm = 0;
|
|
pDb->nShm = 0;
|
|
pDb->pShmhdr = 0;
|
|
|
|
lsmShmLock(pDb, LSM_LOCK_ROTRANS, LSM_LOCK_UNLOCK, 0);
|
|
}
|
|
dbReleaseReadlock(pDb);
|
|
}
|
|
|
|
/*
|
|
** Open a write transaction.
|
|
*/
|
|
int lsmBeginWriteTrans(lsm_db *pDb){
|
|
int rc = LSM_OK; /* Return code */
|
|
ShmHeader *pShm = pDb->pShmhdr; /* Shared memory header */
|
|
|
|
assert( pDb->nTransOpen==0 );
|
|
assert( pDb->bDiscardOld==0 );
|
|
assert( pDb->bReadonly==0 );
|
|
|
|
/* If there is no read-transaction open, open one now. */
|
|
if( pDb->iReader<0 ){
|
|
rc = lsmBeginReadTrans(pDb);
|
|
}
|
|
|
|
/* Attempt to take the WRITER lock */
|
|
if( rc==LSM_OK ){
|
|
rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0);
|
|
}
|
|
|
|
/* If the previous writer failed mid-transaction, run emergency rollback. */
|
|
if( rc==LSM_OK && pShm->bWriter ){
|
|
rc = lsmTreeRepair(pDb);
|
|
if( rc==LSM_OK ) pShm->bWriter = 0;
|
|
}
|
|
|
|
/* Check that this connection is currently reading from the most recent
|
|
** version of the database. If not, return LSM_BUSY. */
|
|
if( rc==LSM_OK && memcmp(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)) ){
|
|
rc = LSM_BUSY;
|
|
}
|
|
|
|
if( rc==LSM_OK ){
|
|
rc = lsmLogBegin(pDb);
|
|
}
|
|
|
|
/* If everything was successful, set the "transaction-in-progress" flag
|
|
** and return LSM_OK. Otherwise, if some error occurred, relinquish the
|
|
** WRITER lock and return an error code. */
|
|
if( rc==LSM_OK ){
|
|
TreeHeader *p = &pDb->treehdr;
|
|
pShm->bWriter = 1;
|
|
p->root.iTransId++;
|
|
if( lsmTreeHasOld(pDb) && p->iOldLog==pDb->pClient->iLogOff ){
|
|
lsmTreeDiscardOld(pDb);
|
|
pDb->bDiscardOld = 1;
|
|
}
|
|
}else{
|
|
lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
|
|
if( pDb->pCsr==0 ) lsmFinishReadTrans(pDb);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** End the current write transaction. The connection is left with an open
|
|
** read transaction. It is an error to call this if there is no open write
|
|
** transaction.
|
|
**
|
|
** If the transaction was committed, then a commit record has already been
|
|
** written into the log file when this function is called. Or, if the
|
|
** transaction was rolled back, both the log file and in-memory tree
|
|
** structure have already been restored. In either case, this function
|
|
** merely releases locks and other resources held by the write-transaction.
|
|
**
|
|
** LSM_OK is returned if successful, or an LSM error code otherwise.
|
|
*/
|
|
int lsmFinishWriteTrans(lsm_db *pDb, int bCommit){
|
|
int rc = LSM_OK;
|
|
int bFlush = 0;
|
|
|
|
lsmLogEnd(pDb, bCommit);
|
|
if( rc==LSM_OK && bCommit && lsmTreeSize(pDb)>pDb->nTreeLimit ){
|
|
bFlush = 1;
|
|
lsmTreeMakeOld(pDb);
|
|
}
|
|
lsmTreeEndTransaction(pDb, bCommit);
|
|
|
|
if( rc==LSM_OK ){
|
|
if( bFlush && pDb->bAutowork ){
|
|
rc = lsmSortedAutoWork(pDb, 1);
|
|
}else if( bCommit && pDb->bDiscardOld ){
|
|
rc = dbSetReadLock(pDb, pDb->pClient->iId, pDb->treehdr.iUsedShmid);
|
|
}
|
|
}
|
|
pDb->bDiscardOld = 0;
|
|
lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
|
|
|
|
if( bFlush && pDb->bAutowork==0 && pDb->xWork ){
|
|
pDb->xWork(pDb, pDb->pWorkCtx);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
|
|
/*
** Debug builds only (LSM_DEBUG). Return the result of lsmMutexHeld() for
** the client mutex of the database that pDb is connected to - i.e. non-zero
** if the caller is holding the client mutex.
*/
#ifdef LSM_DEBUG
int lsmHoldingClientMutex(lsm_db *pDb){
  Database *p = pDb->pDatabase;
  return lsmMutexHeld(pDb->pEnv, p->pClientMutex);
}
#endif
|
|
|
|
/*
** Return 1 if read-lock slot p may be used to read snapshot iLsm with a
** tree id in the range iShmMin..iShmMax, or 0 otherwise. The slot is
** usable when all of the following hold:
**
**   * p->iLsmId is non-zero and not greater than iLsm, and
**   * p->iTreeId lies between iShmMin and iShmMax inclusive, as compared
**     by shm_sequence_ge().
*/
static int slotIsUsable(ShmReader *p, i64 iLsm, u32 iShmMin, u32 iShmMax){
  if( p->iLsmId==0 || p->iLsmId>iLsm ) return 0;
  if( !shm_sequence_ge(iShmMax, p->iTreeId) ) return 0;
  return shm_sequence_ge(p->iTreeId, iShmMin) ? 1 : 0;
}
|
|
|
|
/*
|
|
** Obtain a read-lock on database version identified by the combination
|
|
** of snapshot iLsm and tree iTree. Return LSM_OK if successful, or
|
|
** an LSM error code otherwise.
|
|
*/
|
|
int lsmReadlock(lsm_db *db, i64 iLsm, u32 iShmMin, u32 iShmMax){
|
|
int rc = LSM_OK;
|
|
ShmHeader *pShm = db->pShmhdr;
|
|
int i;
|
|
|
|
assert( db->iReader<0 );
|
|
assert( shm_sequence_ge(iShmMax, iShmMin) );
|
|
|
|
/* This is a no-op if the read-only transaction flag is set. */
|
|
if( db->bRoTrans ){
|
|
db->iReader = 0;
|
|
return LSM_OK;
|
|
}
|
|
|
|
/* Search for an exact match. */
|
|
for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
|
|
ShmReader *p = &pShm->aReader[i];
|
|
if( p->iLsmId==iLsm && p->iTreeId==iShmMax ){
|
|
rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
|
|
if( rc==LSM_OK && p->iLsmId==iLsm && p->iTreeId==iShmMax ){
|
|
db->iReader = i;
|
|
}else if( rc==LSM_BUSY ){
|
|
rc = LSM_OK;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Try to obtain a write-lock on each slot, in order. If successful, set
|
|
** the slot values to iLsm/iTree. */
|
|
for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
|
|
rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
|
|
if( rc==LSM_BUSY ){
|
|
rc = LSM_OK;
|
|
}else{
|
|
ShmReader *p = &pShm->aReader[i];
|
|
p->iLsmId = iLsm;
|
|
p->iTreeId = iShmMax;
|
|
rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
|
|
assert( rc!=LSM_BUSY );
|
|
if( rc==LSM_OK ) db->iReader = i;
|
|
}
|
|
}
|
|
|
|
/* Search for any usable slot */
|
|
for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
|
|
ShmReader *p = &pShm->aReader[i];
|
|
if( slotIsUsable(p, iLsm, iShmMin, iShmMax) ){
|
|
rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
|
|
if( rc==LSM_OK && slotIsUsable(p, iLsm, iShmMin, iShmMax) ){
|
|
db->iReader = i;
|
|
}else if( rc==LSM_BUSY ){
|
|
rc = LSM_OK;
|
|
}
|
|
}
|
|
}
|
|
|
|
if( rc==LSM_OK && db->iReader<0 ){
|
|
rc = LSM_BUSY;
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** This is used to check if there exists a read-lock locking a particular
|
|
** version of either the in-memory tree or database file.
|
|
**
|
|
** If iLsmId is non-zero, then it is a snapshot id. If there exists a
|
|
** read-lock using this snapshot or newer, set *pbInUse to true. Or,
|
|
** if there is no such read-lock, set it to false.
|
|
**
|
|
** Or, if iLsmId is zero, then iShmid is a shared-memory sequence id.
|
|
** Search for a read-lock using this sequence id or newer. etc.
|
|
*/
|
|
static int isInUse(lsm_db *db, i64 iLsmId, u32 iShmid, int *pbInUse){
|
|
ShmHeader *pShm = db->pShmhdr;
|
|
int i;
|
|
int rc = LSM_OK;
|
|
|
|
for(i=0; rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
|
|
ShmReader *p = &pShm->aReader[i];
|
|
if( p->iLsmId ){
|
|
if( (iLsmId!=0 && p->iLsmId!=0 && iLsmId>=p->iLsmId)
|
|
|| (iLsmId==0 && shm_sequence_ge(p->iTreeId, iShmid))
|
|
){
|
|
rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
|
|
if( rc==LSM_OK ){
|
|
p->iLsmId = 0;
|
|
lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if( rc==LSM_BUSY ){
|
|
*pbInUse = 1;
|
|
return LSM_OK;
|
|
}
|
|
*pbInUse = 0;
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** This function is called by worker connections to determine the smallest
|
|
** snapshot id that is currently in use by a database client. The worker
|
|
** connection uses this result to determine whether or not it is safe to
|
|
** recycle a database block.
|
|
*/
|
|
static int firstSnapshotInUse(
|
|
lsm_db *db, /* Database handle */
|
|
i64 *piInUse /* IN/OUT: Smallest snapshot id in use */
|
|
){
|
|
ShmHeader *pShm = db->pShmhdr;
|
|
i64 iInUse = *piInUse;
|
|
int i;
|
|
|
|
assert( iInUse>0 );
|
|
for(i=0; i<LSM_LOCK_NREADER; i++){
|
|
ShmReader *p = &pShm->aReader[i];
|
|
if( p->iLsmId ){
|
|
i64 iThis = p->iLsmId;
|
|
if( iThis!=0 && iInUse>iThis ){
|
|
int rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
|
|
if( rc==LSM_OK ){
|
|
p->iLsmId = 0;
|
|
lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
|
|
}else if( rc==LSM_BUSY ){
|
|
iInUse = iThis;
|
|
}else{
|
|
/* Some error other than LSM_BUSY. Return the error code to
|
|
** the caller in this case. */
|
|
return rc;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
*piInUse = iInUse;
|
|
return LSM_OK;
|
|
}
|
|
|
|
/*
** Set *pbInUse to true if shared-memory sequence id iShmid is in use. It
** is considered in use if it is the id recorded in this connection's own
** tree header (db->treehdr.iUsedShmid); otherwise the read-lock slots are
** consulted via isInUse(). Returns LSM_OK or an LSM error code.
*/
int lsmTreeInUse(lsm_db *db, u32 iShmid, int *pbInUse){
  if( db->treehdr.iUsedShmid!=iShmid ){
    return isInUse(db, 0, iShmid, pbInUse);
  }
  *pbInUse = 1;
  return LSM_OK;
}
|
|
|
|
/*
** Set *pbInUse to true if snapshot iLsmId is in use. It is considered in
** use if this connection has a client snapshot whose id is less than or
** equal to iLsmId; otherwise the read-lock slots are consulted via
** isInUse(). Returns LSM_OK or an LSM error code.
*/
int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse){
  if( db->pClient==0 || db->pClient->iId>iLsmId ){
    return isInUse(db, iLsmId, 0, pbInUse);
  }
  *pbInUse = 1;
  return LSM_OK;
}
|
|
|
|
/*
|
|
** This function may only be called after a successful call to
|
|
** lsmDbDatabaseConnect(). It returns true if the connection is in
|
|
** multi-process mode, or false otherwise.
|
|
*/
|
|
int lsmDbMultiProc(lsm_db *pDb){
|
|
return pDb->pDatabase && pDb->pDatabase->bMultiProc;
|
|
}
|
|
|
|
|
|
/*************************************************************************
|
|
**************************************************************************
|
|
**************************************************************************
|
|
**************************************************************************
|
|
**************************************************************************
|
|
*************************************************************************/
|
|
|
|
/*
|
|
** Ensure that database connection db has cached pointers to at least the
|
|
** first nChunk chunks of shared memory.
|
|
*/
|
|
int lsmShmCacheChunks(lsm_db *db, int nChunk){
|
|
int rc = LSM_OK;
|
|
if( nChunk>db->nShm ){
|
|
static const int NINCR = 16;
|
|
Database *p = db->pDatabase;
|
|
lsm_env *pEnv = db->pEnv;
|
|
int nAlloc;
|
|
int i;
|
|
|
|
/* Ensure that the db->apShm[] array is large enough. If an attempt to
|
|
** allocate memory fails, return LSM_NOMEM immediately. The apShm[] array
|
|
** is always extended in multiples of 16 entries - so the actual allocated
|
|
** size can be inferred from nShm. */
|
|
nAlloc = ((db->nShm + NINCR - 1) / NINCR) * NINCR;
|
|
while( nChunk>=nAlloc ){
|
|
void **apShm;
|
|
nAlloc += NINCR;
|
|
apShm = lsmRealloc(pEnv, db->apShm, sizeof(void*)*nAlloc);
|
|
if( !apShm ) return LSM_NOMEM_BKPT;
|
|
db->apShm = apShm;
|
|
}
|
|
|
|
if( db->bRoTrans ){
|
|
for(i=db->nShm; rc==LSM_OK && i<nChunk; i++){
|
|
db->apShm[i] = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc);
|
|
db->nShm++;
|
|
}
|
|
|
|
}else{
|
|
|
|
/* Enter the client mutex */
|
|
lsmMutexEnter(pEnv, p->pClientMutex);
|
|
|
|
/* Extend the Database objects apShmChunk[] array if necessary. Using the
|
|
** same pattern as for the lsm_db.apShm[] array above. */
|
|
nAlloc = ((p->nShmChunk + NINCR - 1) / NINCR) * NINCR;
|
|
while( nChunk>=nAlloc ){
|
|
void **apShm;
|
|
nAlloc += NINCR;
|
|
apShm = lsmRealloc(pEnv, p->apShmChunk, sizeof(void*)*nAlloc);
|
|
if( !apShm ){
|
|
rc = LSM_NOMEM_BKPT;
|
|
break;
|
|
}
|
|
p->apShmChunk = apShm;
|
|
}
|
|
|
|
for(i=db->nShm; rc==LSM_OK && i<nChunk; i++){
|
|
if( i>=p->nShmChunk ){
|
|
void *pChunk = 0;
|
|
if( p->bMultiProc==0 ){
|
|
/* Single process mode */
|
|
pChunk = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc);
|
|
}else{
|
|
/* Multi-process mode */
|
|
rc = lsmEnvShmMap(pEnv, p->pFile, i, LSM_SHM_CHUNK_SIZE, &pChunk);
|
|
}
|
|
if( rc==LSM_OK ){
|
|
p->apShmChunk[i] = pChunk;
|
|
p->nShmChunk++;
|
|
}
|
|
}
|
|
if( rc==LSM_OK ){
|
|
db->apShm[i] = p->apShmChunk[i];
|
|
db->nShm++;
|
|
}
|
|
}
|
|
|
|
/* Release the client mutex */
|
|
lsmMutexLeave(pEnv, p->pClientMutex);
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
** Apply lock operation eOp to lock iLock on the file of Database p by
** calling lsmEnvLock(). This is done only in multi-process mode
** (p->bMultiProc set); otherwise nothing is locked and LSM_OK is returned.
*/
static int lockSharedFile(lsm_env *pEnv, Database *p, int iLock, int eOp){
  if( p->bMultiProc==0 ) return LSM_OK;
  return lsmEnvLock(pEnv, p->pFile, iLock, eOp);
}
|
|
|
|
/*
|
|
** Test if it would be possible for connection db to obtain a lock of type
|
|
** eType on the nLock locks starting at iLock. If so, return LSM_OK. If it
|
|
** would not be possible to obtain the lock due to a lock held by another
|
|
** connection, return LSM_BUSY. If an IO or other error occurs (i.e. in the
|
|
** lsm_env.xTestLock function), return some other LSM error code.
|
|
**
|
|
** Note that this function never actually locks the database - it merely
|
|
** queries the system to see if there exists a lock that would prevent
|
|
** it from doing so.
|
|
*/
|
|
int lsmShmTestLock(
|
|
lsm_db *db,
|
|
int iLock,
|
|
int nLock,
|
|
int eOp
|
|
){
|
|
int rc = LSM_OK;
|
|
lsm_db *pIter;
|
|
Database *p = db->pDatabase;
|
|
int i;
|
|
u64 mask = 0;
|
|
|
|
for(i=iLock; i<(iLock+nLock); i++){
|
|
mask |= ((u64)1 << (iLock-1));
|
|
if( eOp==LSM_LOCK_EXCL ) mask |= ((u64)1 << (iLock+32-1));
|
|
}
|
|
|
|
lsmMutexEnter(db->pEnv, p->pClientMutex);
|
|
for(pIter=p->pConn; pIter; pIter=pIter->pNext){
|
|
if( pIter!=db && (pIter->mLock & mask) ){
|
|
assert( pIter!=db );
|
|
break;
|
|
}
|
|
}
|
|
|
|
if( pIter ){
|
|
rc = LSM_BUSY;
|
|
}else if( p->bMultiProc ){
|
|
rc = lsmEnvTestLock(db->pEnv, p->pFile, iLock, nLock, eOp);
|
|
}
|
|
|
|
lsmMutexLeave(db->pEnv, p->pClientMutex);
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Attempt to obtain the lock identified by the iLock and bExcl parameters.
|
|
** If successful, return LSM_OK. If the lock cannot be obtained because
|
|
** there exists some other conflicting lock, return LSM_BUSY. If some other
|
|
** error occurs, return an LSM error code.
|
|
**
|
|
** Parameter iLock must be one of LSM_LOCK_WRITER, WORKER or CHECKPOINTER,
|
|
** or else a value returned by the LSM_LOCK_READER macro.
|
|
*/
|
|
int lsmShmLock(
|
|
lsm_db *db,
|
|
int iLock,
|
|
int eOp, /* One of LSM_LOCK_UNLOCK, SHARED or EXCL */
|
|
int bBlock /* True for a blocking lock */
|
|
){
|
|
lsm_db *pIter;
|
|
const u64 me = ((u64)1 << (iLock-1));
|
|
const u64 ms = ((u64)1 << (iLock+32-1));
|
|
int rc = LSM_OK;
|
|
Database *p = db->pDatabase;
|
|
|
|
assert( eOp!=LSM_LOCK_EXCL || p->bReadonly==0 );
|
|
assert( iLock>=1 && iLock<=LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT-1) );
|
|
assert( LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT-1)<=32 );
|
|
assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );
|
|
|
|
/* Check for a no-op. Proceed only if this is not one of those. */
|
|
if( (eOp==LSM_LOCK_UNLOCK && (db->mLock & (me|ms))!=0)
|
|
|| (eOp==LSM_LOCK_SHARED && (db->mLock & (me|ms))!=ms)
|
|
|| (eOp==LSM_LOCK_EXCL && (db->mLock & me)==0)
|
|
){
|
|
int nExcl = 0; /* Number of connections holding EXCLUSIVE */
|
|
int nShared = 0; /* Number of connections holding SHARED */
|
|
lsmMutexEnter(db->pEnv, p->pClientMutex);
|
|
|
|
/* Figure out the locks currently held by this process on iLock, not
|
|
** including any held by connection db. */
|
|
for(pIter=p->pConn; pIter; pIter=pIter->pNext){
|
|
assert( (pIter->mLock & me)==0 || (pIter->mLock & ms)!=0 );
|
|
if( pIter!=db ){
|
|
if( pIter->mLock & me ){
|
|
nExcl++;
|
|
}else if( pIter->mLock & ms ){
|
|
nShared++;
|
|
}
|
|
}
|
|
}
|
|
assert( nExcl==0 || nExcl==1 );
|
|
assert( nExcl==0 || nShared==0 );
|
|
assert( nExcl==0 || (db->mLock & (me|ms))==0 );
|
|
|
|
switch( eOp ){
|
|
case LSM_LOCK_UNLOCK:
|
|
if( nShared==0 ){
|
|
lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_UNLOCK);
|
|
}
|
|
db->mLock &= ~(me|ms);
|
|
break;
|
|
|
|
case LSM_LOCK_SHARED:
|
|
if( nExcl ){
|
|
rc = LSM_BUSY;
|
|
}else{
|
|
if( nShared==0 ){
|
|
rc = lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_SHARED);
|
|
}
|
|
if( rc==LSM_OK ){
|
|
db->mLock |= ms;
|
|
db->mLock &= ~me;
|
|
}
|
|
}
|
|
break;
|
|
|
|
default:
|
|
assert( eOp==LSM_LOCK_EXCL );
|
|
if( nExcl || nShared ){
|
|
rc = LSM_BUSY;
|
|
}else{
|
|
rc = lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_EXCL);
|
|
if( rc==LSM_OK ){
|
|
db->mLock |= (me|ms);
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
|
|
lsmMutexLeave(db->pEnv, p->pClientMutex);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
#ifdef LSM_DEBUG
|
|
|
|
/*
** Return the type of lock connection db currently records on lock iLock in
** its mLock bitmask: LSM_LOCK_EXCL if the exclusive bit (iLock-1) is set,
** else LSM_LOCK_SHARED if the shared bit (iLock+32-1) is set, else
** LSM_LOCK_UNLOCK.
*/
int shmLockType(lsm_db *db, int iLock){
  const u64 mExcl = ((u64)1 << (iLock-1));
  const u64 mShared = ((u64)1 << (iLock+32-1));
  int eType = LSM_LOCK_UNLOCK;

  if( db->mLock & mExcl ){
    eType = LSM_LOCK_EXCL;
  }else if( db->mLock & mShared ){
    eType = LSM_LOCK_SHARED;
  }
  return eType;
}
|
|
|
|
/*
|
|
** The arguments passed to this function are similar to those passed to
|
|
** the lsmShmLock() function. However, instead of obtaining a new lock
|
|
** this function returns true if the specified connection already holds
|
|
** (or does not hold) such a lock, depending on the value of eOp. As
|
|
** follows:
|
|
**
|
|
** (eOp==LSM_LOCK_UNLOCK) -> true if db has no lock on iLock
|
|
** (eOp==LSM_LOCK_SHARED) -> true if db has at least a SHARED lock on iLock.
|
|
** (eOp==LSM_LOCK_EXCL) -> true if db has an EXCLUSIVE lock on iLock.
|
|
*/
|
|
int lsmShmAssertLock(lsm_db *db, int iLock, int eOp){
|
|
int ret = 0;
|
|
int eHave;
|
|
|
|
assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) );
|
|
assert( iLock<=16 );
|
|
assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );
|
|
|
|
eHave = shmLockType(db, iLock);
|
|
|
|
switch( eOp ){
|
|
case LSM_LOCK_UNLOCK:
|
|
ret = (eHave==LSM_LOCK_UNLOCK);
|
|
break;
|
|
case LSM_LOCK_SHARED:
|
|
ret = (eHave!=LSM_LOCK_UNLOCK);
|
|
break;
|
|
case LSM_LOCK_EXCL:
|
|
ret = (eHave==LSM_LOCK_EXCL);
|
|
break;
|
|
default:
|
|
assert( !"bad eOp value passed to lsmShmAssertLock()" );
|
|
break;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
** Return 1 if connection db both holds an EXCLUSIVE lock on
** LSM_LOCK_WORKER and has a worker snapshot loaded (db->pWorker), or 0
** otherwise.
*/
int lsmShmAssertWorker(lsm_db *db){
  if( !lsmShmAssertLock(db, LSM_LOCK_WORKER, LSM_LOCK_EXCL) ) return 0;
  return db->pWorker!=0;
}
|
|
|
|
/*
|
|
** This function does not contribute to library functionality, and is not
|
|
** included in release builds. It is intended to be called from within
|
|
** an interactive debugger.
|
|
**
|
|
** When called, this function prints a single line of human readable output
|
|
** to stdout describing the locks currently held by the connection. For
|
|
** example:
|
|
**
|
|
** (gdb) call print_db_locks(pDb)
|
|
** (shared on dms2) (exclusive on writer)
|
|
*/
|
|
void print_db_locks(lsm_db *db){
|
|
int iLock;
|
|
for(iLock=0; iLock<16; iLock++){
|
|
int bOne = 0;
|
|
const char *azLock[] = {0, "shared", "exclusive"};
|
|
const char *azName[] = {
|
|
0, "dms1", "dms2", "writer", "worker", "checkpointer",
|
|
"reader0", "reader1", "reader2", "reader3", "reader4", "reader5"
|
|
};
|
|
int eHave = shmLockType(db, iLock);
|
|
if( azLock[eHave] ){
|
|
printf("%s(%s on %s)", (bOne?" ":""), azLock[eHave], azName[iLock]);
|
|
bOne = 1;
|
|
}
|
|
}
|
|
printf("\n");
|
|
}
|
|
/*
** Debugging aid. For every connection in the list headed by
** db->pDatabase->pConn, print one line to stdout giving the connection
** pointer followed by the output of print_db_locks() for it. The line for
** connection db itself is prefixed with "*".
**
** The connection list is walked without taking the client mutex.
*/
void print_all_db_locks(lsm_db *db){
  lsm_db *p;
  for(p=db->pDatabase->pConn; p; p=p->pNext){
    /* %p requires a void* argument. */
    printf("%s connection %p ", ((p==db)?"*":""), (void *)p);
    print_db_locks(p);
  }
}
|
|
#endif
|
|
|
|
/*
** Invoke the shared-memory barrier of the environment that connection db
** uses, via lsmEnvShmBarrier().
*/
void lsmShmBarrier(lsm_db *db){
  lsm_env *pEnv = db->pEnv;
  lsmEnvShmBarrier(pEnv);
}
|
|
|
|
/*
** Attempt a checkpoint of database pDb by calling lsmCheckpointWrite()
** with its second argument set to 0.
**
** If pnKB is not NULL, *pnKB is set to the number of KB of data
** checkpointed: the page count reported by lsmCheckpointWrite() multiplied
** by the page size, rounded up to a whole KB. It is set to zero if an
** error occurred or no pages were reported.
**
** Returns the result of lsmCheckpointWrite().
*/
int lsm_checkpoint(lsm_db *pDb, int *pnKB){
  u32 nPage = 0;                  /* Number of pages checkpointed */
  int rc = lsmCheckpointWrite(pDb, 0, &nPage);

  if( pnKB ){
    if( rc==LSM_OK && nPage ){
      *pnKB = (int)((((i64)nPage * lsmFsPageSize(pDb->pFS)) + 1023) / 1024);
    }else{
      *pnKB = 0;
    }
  }

  return rc;
}
|