sqlite/ext/lsm1/lsm_sorted.c
dan ae2e7fcc4c Fix another issue with very large compressed LSM databases.
FossilOrigin-Name: d71154265a294a1ece89d257f55f6855db7c30aec55ea0dc4eeb61bce1e8fad3
2023-01-17 19:34:01 +00:00

6196 lines
180 KiB
C

/*
** 2011-08-14
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
**
** PAGE FORMAT:
**
** The maximum page size is 65536 bytes.
**
** Since all records are equal to or larger than 2 bytes in size, and
** some space within the page is consumed by the page footer, there must
** be less than 2^15 records on each page.
**
** Each page ends with a footer that describes the page's contents. This
** footer serves a similar purpose to the page header in an SQLite database.
** A footer is used instead of a header because it makes it easier to
** populate a new page based on a sorted list of key/value pairs.
**
** The footer consists of the following values (starting at the end of
** the page and continuing backwards towards the start). All values are
** stored as unsigned big-endian integers.
**
** * Number of records on page (2 bytes).
** * Flags field (2 bytes).
** * Left-hand pointer value (8 bytes).
** * The starting offset of each record (2 bytes per record).
**
** Records may span pages. Unless it happens to be an exact fit, the part
** of the final record that starts on page X that does not fit on page X
** is stored at the start of page (X+1). This means there may be pages where
** (N==0). And on most pages the first record that starts on the page will
** not start at byte offset 0. For example:
**
** aaaaa bbbbb ccc <footer> cc eeeee fffff g <footer> gggg....
**
** RECORD FORMAT:
**
** The first byte of the record is a flags byte. It is a combination
** of the following flags (defined in lsmInt.h):
**
** LSM_START_DELETE
** LSM_END_DELETE
** LSM_POINT_DELETE
** LSM_INSERT
** LSM_SEPARATOR
** LSM_SYSTEMKEY
**
** Immediately following the type byte is a pointer to the smallest key
** in the next file that is larger than the key in the current record. The
** pointer is encoded as a varint. When added to the left-hand pointer value
** (8 bytes) stored in the footer, it is the page number of the page that
** contains the smallest key in the next sorted file that is larger than
** this key.
**
** Next is the number of bytes in the key, encoded as a varint.
**
** If the LSM_INSERT flag is set, the number of bytes in the value, as
** a varint, is next.
**
** Finally, the blob of data containing the key, and for LSM_INSERT
** records, the value as well.
*/
#ifndef _LSM_INT_H
# include "lsmInt.h"
#endif
#define LSM_LOG_STRUCTURE 0
#define LSM_LOG_DATA 0
/*
** Macros to help decode record types.
**
** Each macro takes the flags byte of a record (eType):
**
** rtTopic() The LSM_SYSTEMKEY bit of eType (non-zero for system keys).
** rtIsDelete() True if the low four bits of eType equal LSM_POINT_DELETE.
** rtIsSeparator() True if the LSM_SEPARATOR bit is set.
** rtIsWrite() True if the LSM_INSERT bit is set.
** rtIsSystem() True if the LSM_SYSTEMKEY bit is set.
*/
#define rtTopic(eType) ((eType) & LSM_SYSTEMKEY)
#define rtIsDelete(eType) (((eType) & 0x0F)==LSM_POINT_DELETE)
#define rtIsSeparator(eType) (((eType) & LSM_SEPARATOR)!=0)
#define rtIsWrite(eType) (((eType) & LSM_INSERT)!=0)
#define rtIsSystem(eType) (((eType) & LSM_SYSTEMKEY)!=0)
/*
** The following macros are used to access a page footer.
**
** Each SEGMENT_XXX_OFFSET(pgsz) macro evaluates to the byte offset, within
** a page of pgsz bytes, of one footer field:
**
** SEGMENT_NRECORD_OFFSET Number of records (last 2 bytes of the page).
** SEGMENT_FLAGS_OFFSET Flags field (2 bytes).
** SEGMENT_POINTER_OFFSET Footer pointer value (8 bytes).
** SEGMENT_CELLPTR_OFFSET 2-byte offset of cell iCell. Cell pointers grow
** downwards from the pointer field.
**
** SEGMENT_EOF(pgsz, nEntry) is the offset of the cell pointer of the last
** of nEntry cells, i.e. the first byte of the footer. For nEntry==0 this
** equals SEGMENT_POINTER_OFFSET(pgsz).
**
** SEGMENT_BTREE_FLAG is tested against the footer flags field to identify
** b-tree pages.
**
** LSM_SEGMENTPTR_FREE_THRESHOLD is the nThreshold value passed to
** segmentPtrReset(): key/value buffers with at least this many bytes
** allocated are freed when a segment pointer is reset.
*/
#define SEGMENT_NRECORD_OFFSET(pgsz) ((pgsz) - 2)
#define SEGMENT_FLAGS_OFFSET(pgsz) ((pgsz) - 2 - 2)
#define SEGMENT_POINTER_OFFSET(pgsz) ((pgsz) - 2 - 2 - 8)
#define SEGMENT_CELLPTR_OFFSET(pgsz, iCell) ((pgsz) - 2 - 2 - 8 - 2 - (iCell)*2)
#define SEGMENT_EOF(pgsz, nEntry) SEGMENT_CELLPTR_OFFSET(pgsz, nEntry-1)
#define SEGMENT_BTREE_FLAG 0x0001
#define PGFTR_SKIP_NEXT_FLAG 0x0002
#define PGFTR_SKIP_THIS_FLAG 0x0004
#ifndef LSM_SEGMENTPTR_FREE_THRESHOLD
# define LSM_SEGMENTPTR_FREE_THRESHOLD 1024
#endif
typedef struct SegmentPtr SegmentPtr;
typedef struct LsmBlob LsmBlob;
/*
** A growable heap buffer. Managed by sortedBlobGrow(), sortedBlobSet() and
** sortedBlobFree().
*/
struct LsmBlob {
lsm_env *pEnv; /* Environment used to allocate (and free) pData */
void *pData; /* Allocated buffer, or NULL if none */
int nData; /* Number of bytes of pData[] currently in use */
int nAlloc; /* Size requested for the pData allocation, in bytes */
};
/*
** A SegmentPtr object may be used for one of two purposes:
**
** * To iterate and/or seek within a single Segment (the combination of a
** main run and an optional sorted run).
**
** * To iterate through the separators array of a segment.
**
** The "current page" fields are populated by segmentPtrSetPage() and the
** "current cell" fields by segmentPtrLoadCell().
*/
struct SegmentPtr {
Level *pLevel; /* Level object segment is part of */
Segment *pSeg; /* Segment to access */
/* Current page. See segmentPtrLoadPage(). */
Page *pPg; /* Current page */
u16 flags; /* Copy of page flags field */
int nCell; /* Number of cells on pPg */
LsmPgno iPtr; /* Base cascade pointer */
/* Current cell. See segmentPtrLoadCell() */
int iCell; /* Current record within page pPg */
int eType; /* Type of current record */
LsmPgno iPgPtr; /* Cascade pointer offset */
void *pKey; int nKey; /* Key associated with current record */
void *pVal; int nVal; /* Current record value (eType==WRITE only) */
/* Blobs used to allocate buffers for pKey and pVal as required */
LsmBlob blob1; /* Backs pKey when the key is not read in place */
LsmBlob blob2; /* Backs pVal when the value is not read in place */
};
/*
** Used to iterate through the keys stored in a b-tree hierarchy from start
** to finish. Only First() and Next() operations are required.
**
** btreeCursorNew()
** btreeCursorFirst()
** btreeCursorNext()
** btreeCursorFree()
** btreeCursorPosition()
** btreeCursorRestore()
**
** A BtreePg is one entry of the BtreeCursor.aPg[] stack: a page reference
** and the cursor's cell position on that page.
*/
typedef struct BtreePg BtreePg;
typedef struct BtreeCursor BtreeCursor;
struct BtreePg {
Page *pPage; /* Page reference (released by the cursor) */
int iCell; /* Cell position on pPage. May be -1, see btreeCursorNext() */
};
struct BtreeCursor {
Segment *pSeg; /* Iterate through this segments btree */
FileSystem *pFS; /* File system to read pages from */
int nDepth; /* Allocated size of aPg[] */
int iPg; /* Current entry in aPg[]. -1 -> EOF. */
BtreePg *aPg; /* Pages from root to current location */
/* Cache of current entry. pKey==0 for EOF. */
void *pKey; /* Current key */
int nKey; /* Size of pKey in bytes */
int eType; /* Type of current key (LSM_SEPARATOR is always set) */
LsmPgno iPtr; /* Pointer value, see btreeCursorPtr() */
/* Storage for key, if not local */
LsmBlob blob;
};
/*
** A cursor used for merged searches or iterations through up to one
** Tree structure and any number of sorted files.
**
** lsmMCursorNew()
** lsmMCursorSeek()
** lsmMCursorNext()
** lsmMCursorPrev()
** lsmMCursorFirst()
** lsmMCursorLast()
** lsmMCursorKey()
** lsmMCursorValue()
** lsmMCursorValid()
**
** iFree:
** This variable is only used by cursors providing input data for a
** new top-level segment. Such cursors only ever iterate forwards, not
** backwards.
**
** flags:
** A mask of the CURSOR_XXX values defined below this structure.
*/
struct MultiCursor {
lsm_db *pDb; /* Connection that owns this cursor */
MultiCursor *pNext; /* Next cursor owned by connection pDb */
int flags; /* Mask of CURSOR_XXX flags */
int eType; /* Cache of current key type */
LsmBlob key; /* Cache of current key (or NULL) */
LsmBlob val; /* Cache of current value */
/* All the component cursors: */
TreeCursor *apTreeCsr[2]; /* Up to two tree cursors */
int iFree; /* Next element of free-list (-ve for eof) */
SegmentPtr *aPtr; /* Array of segment pointers */
int nPtr; /* Size of array aPtr[] */
BtreeCursor *pBtCsr; /* b-tree cursor (db writes only) */
/* Comparison results */
int nTree; /* Size of aTree[] array */
int *aTree; /* Array of comparison results */
/* Used by cursors flushing the in-memory tree only */
void *pSystemVal; /* Pointer to buffer to free */
/* Used by worker cursors only */
LsmPgno *pPrevMergePtr; /* NOTE(review): purpose not evident from this part of the file */
};
/*
** The following constants are used to assign integers to each component
** cursor of a multi-cursor.
*/
#define CURSOR_DATA_TREE0 0 /* Current tree cursor (apTreeCsr[0]) */
#define CURSOR_DATA_TREE1 1 /* The "old" tree, if any (apTreeCsr[1]) */
#define CURSOR_DATA_SYSTEM 2 /* Free-list entries (new-toplevel only) */
#define CURSOR_DATA_SEGMENT 3 /* First segment pointer (aPtr[0]) */
/*
** Values for the MultiCursor.flags bitmask. Bits 0x04 and 0x08 are not
** assigned by the definitions below.
**
** CURSOR_IGNORE_DELETE
** If set, this cursor will not visit SORTED_DELETE keys.
**
** CURSOR_FLUSH_FREELIST
** This cursor is being used to create a new toplevel. It should also
** iterate through the contents of the in-memory free block list.
**
** CURSOR_IGNORE_SYSTEM
** If set, this cursor ignores system keys.
**
** CURSOR_NEXT_OK
** Set if it is Ok to call lsm_csr_next().
**
** CURSOR_PREV_OK
** Set if it is Ok to call lsm_csr_prev().
**
** CURSOR_READ_SEPARATORS
** Set if this cursor should visit the separator keys in segment
** aPtr[nPtr-1]. See segmentPtrIgnoreSeparators().
**
** CURSOR_SEEK_EQ
** Cursor has undergone a successful lsm_csr_seek(LSM_SEEK_EQ) operation.
** The key and value are stored in MultiCursor.key and MultiCursor.val
** respectively.
*/
#define CURSOR_IGNORE_DELETE 0x00000001
#define CURSOR_FLUSH_FREELIST 0x00000002
#define CURSOR_IGNORE_SYSTEM 0x00000010
#define CURSOR_NEXT_OK 0x00000020
#define CURSOR_PREV_OK 0x00000040
#define CURSOR_READ_SEPARATORS 0x00000080
#define CURSOR_SEEK_EQ 0x00000100
typedef struct MergeWorker MergeWorker;
typedef struct Hierarchy Hierarchy;
/*
** An array of pages. Used as MergeWorker.hier, the b-tree hierarchy under
** construction.
*/
struct Hierarchy {
Page **apHier; /* Array of page pointers */
int nHier; /* Presumably the number of entries in apHier[] - confirm */
};
/*
** State of a merge (or in-memory tree flush) in progress.
**
** aSave:
** When mergeWorkerNextPage() is called to advance to the next page in
** the output segment, if the bStore flag for an element of aSave[] is
** true, it is cleared and the corresponding iPgno value is set to the
** page number of the page just completed.
**
** aSave[0] is used to record the pointer value to be pushed into the
** b-tree hierarchy. aSave[1] is used to save the page number of the
** page containing the indirect key most recently written to the b-tree.
** see mergeWorkerPushHierarchy() for details.
*/
struct MergeWorker {
lsm_db *pDb; /* Database handle */
Level *pLevel; /* Worker snapshot Level being merged */
MultiCursor *pCsr; /* Cursor to read new segment contents from */
int bFlush; /* True if this is an in-memory tree flush */
Hierarchy hier; /* B-tree hierarchy under construction */
Page *pPage; /* Current output page */
int nWork; /* Number of calls to mergeWorkerNextPage() */
LsmPgno *aGobble; /* Gobble point for each input segment */
LsmPgno iIndirect; /* NOTE(review): usage not visible in this part of the file */
struct SavedPgno {
LsmPgno iPgno; /* Saved page number (valid once bStore has been cleared) */
int bStore; /* True to store the next completed page number in iPgno */
} aSave[2];
};
/*
** Forward declarations for expensive debugging checks. Unless
** LSM_DEBUG_EXPENSIVE is defined, assertRunInOrder() and assertBtreeOk()
** are macros that expand to nothing, so calls to them compile away.
*/
#ifdef LSM_DEBUG_EXPENSIVE
static int assertPointersOk(lsm_db *, Segment *, Segment *, int);
static int assertBtreeOk(lsm_db *, Segment *);
static void assertRunInOrder(lsm_db *pDb, Segment *pSeg);
#else
#define assertRunInOrder(x,y)
#define assertBtreeOk(x,y)
#endif
/*
** The view of a page object used by fsPageData(): a data buffer and its
** size in bytes.
*/
struct FilePage {
  u8 *aData;
  int nData;
};

/*
** Return a pointer to the data buffer of page pPg and write the size of
** that buffer, in bytes, to *pnData. The Page pointer is accessed by
** casting it to (struct FilePage *).
*/
static u8 *fsPageData(Page *pPg, int *pnData){
  struct FilePage *pFilePg = (struct FilePage *)pPg;
  *pnData = pFilePg->nData;
  return pFilePg->aData;
}
/*UNUSED static u8 *fsPageDataPtr(Page *pPg){
return ((struct FilePage *)(pPg))->aData;
}*/
/*
** Store the 16-bit unsigned value nVal in the two bytes at aOut, most
** significant byte first (big-endian).
*/
void lsmPutU16(u8 *aOut, u16 nVal){
  u8 hi = (u8)((nVal >> 8) & 0xFF);
  u8 lo = (u8)(nVal & 0xFF);
  aOut[0] = hi;
  aOut[1] = lo;
}
/*
** Store the 32-bit unsigned value nVal in the four bytes at aOut, most
** significant byte first (big-endian).
*/
void lsmPutU32(u8 *aOut, u32 nVal){
  int i;
  for(i=0; i<4; i++){
    aOut[i] = (u8)((nVal >> (24 - 8*i)) & 0xFF);
  }
}
/*
** Decode and return the 16-bit unsigned big-endian integer stored in the
** two bytes at aOut.
*/
int lsmGetU16(u8 *aOut){
  int hi = aOut[0];
  int lo = aOut[1];
  return (hi << 8) | lo;
}
/*
** Decode and return the 32-bit unsigned big-endian integer stored in the
** four bytes at aOut.
*/
u32 lsmGetU32(u8 *aOut){
  u32 v = 0;
  int i;
  for(i=0; i<4; i++){
    v = (v << 8) | (u32)aOut[i];
  }
  return v;
}
/*
** Decode and return the 64-bit unsigned big-endian integer stored in the
** eight bytes at aOut.
*/
u64 lsmGetU64(u8 *aOut){
  u64 v = 0;
  int i;
  for(i=0; i<8; i++){
    v = (v << 8) | (u64)aOut[i];
  }
  return v;
}
/*
** Store the 64-bit unsigned value nVal in the eight bytes at aOut, most
** significant byte first (big-endian).
*/
void lsmPutU64(u8 *aOut, u64 nVal){
  int i;
  for(i=0; i<8; i++){
    aOut[i] = (u8)((nVal >> (56 - 8*i)) & 0xFF);
  }
}
/*
** Ensure that the buffer of pBlob has been sized for at least nData bytes.
** If the recorded allocation (nAlloc) is smaller than nData, the buffer is
** reallocated via lsmReallocOrFree() and pBlob is bound to pEnv.
**
** Return LSM_OK on success, or LSM_NOMEM_BKPT if the reallocation fails
** (in which case pBlob->pData is NULL).
*/
static int sortedBlobGrow(lsm_env *pEnv, LsmBlob *pBlob, int nData){
  assert( pBlob->pEnv==pEnv || (pBlob->pEnv==0 && pBlob->pData==0) );

  /* Nothing to do if the buffer is already large enough. */
  if( pBlob->nAlloc>=nData ){
    return LSM_OK;
  }

  pBlob->pData = lsmReallocOrFree(pEnv, pBlob->pData, nData);
  if( pBlob->pData==0 ){
    return LSM_NOMEM_BKPT;
  }
  pBlob->nAlloc = nData;
  pBlob->pEnv = pEnv;
  return LSM_OK;
}
/*
** Replace the contents of pBlob with a copy of the nData bytes at pData,
** growing the buffer first if required.
**
** Return LSM_OK on success, or LSM_NOMEM if the buffer cannot be grown.
*/
static int sortedBlobSet(lsm_env *pEnv, LsmBlob *pBlob, void *pData, int nData){
  int rcGrow = sortedBlobGrow(pEnv, pBlob, nData);
  if( rcGrow!=0 ){
    return LSM_NOMEM;
  }
  memcpy(pBlob->pData, pData, nData);
  pBlob->nData = nData;
  return LSM_OK;
}
#if 0
/*
** Disabled code: copy the contents of pSrc into pDest.
**
** NOTE(review): the call below passes three arguments, but sortedBlobSet()
** as defined in this file takes four (the first being an lsm_env*). This
** would need updating before it could be re-enabled.
*/
static int sortedBlobCopy(LsmBlob *pDest, LsmBlob *pSrc){
return sortedBlobSet(pDest, pSrc->pData, pSrc->nData);
}
#endif
/*
** Free any buffer held by pBlob (using the environment recorded in the
** blob) and zero the entire LsmBlob structure.
*/
static void sortedBlobFree(LsmBlob *pBlob){
  void *pOld = pBlob->pData;
  assert( pBlob->pEnv || pBlob->pData==0 );
  if( pOld!=0 ){
    lsmFree(pBlob->pEnv, pOld);
  }
  memset(pBlob, 0, sizeof(*pBlob));
}
/*
** Read nByte bytes of record data starting at byte offset iOff of page pPg,
** which belongs to segment pSeg.
**
** If the requested range ends at or before the start of the page footer,
** *ppData is set to point directly into the page buffer and pBlob is not
** used. Otherwise the data spills onto the following page(s): pBlob is
** grown to nByte bytes, the data is gathered into it page by page (pages
** with SEGMENT_BTREE_FLAG set are skipped) and *ppData is set to
** pBlob->pData.
**
** Returns LSM_OK on success. Otherwise returns the error from
** sortedBlobGrow() or lsmFsDbPageNext(), or LSM_CORRUPT_BKPT if the segment
** ends before all nByte bytes have been read.
*/
static int sortedReadData(
Segment *pSeg,
Page *pPg,
int iOff,
int nByte,
void **ppData,
LsmBlob *pBlob
){
int rc = LSM_OK;
int iEnd;
int nData;
int nCell;
u8 *aData;
aData = fsPageData(pPg, &nData);
nCell = lsmGetU16(&aData[SEGMENT_NRECORD_OFFSET(nData)]);
/* iEnd is the offset of the first byte of the page footer */
iEnd = SEGMENT_EOF(nData, nCell);
assert( iEnd>0 && iEnd<nData );
if( iOff+nByte<=iEnd ){
*ppData = (void *)&aData[iOff];
}else{
int nRem = nByte;
int i = iOff;
u8 *aDest;
/* Make sure the blob is big enough to store the value being loaded. */
rc = sortedBlobGrow(lsmPageEnv(pPg), pBlob, nByte);
if( rc!=LSM_OK ) return rc;
pBlob->nData = nByte;
aDest = (u8 *)pBlob->pData;
*ppData = pBlob->pData;
/* Increment the pointer pages ref-count. This balances the release of
** pPg performed below, whether pPg is still the caller's page or a
** subsequent page obtained from lsmFsDbPageNext(). */
lsmFsPageRef(pPg);
while( rc==LSM_OK ){
Page *pNext;
int flags;
/* Copy data from pPg into the output buffer. */
int nCopy = LSM_MIN(nRem, iEnd-i);
if( nCopy>0 ){
memcpy(&aDest[nByte-nRem], &aData[i], nCopy);
nRem -= nCopy;
i += nCopy;
assert( nRem==0 || i==iEnd );
}
assert( nRem>=0 );
if( nRem==0 ) break;
/* Rebase the read offset so that it is relative to the next page */
i -= iEnd;
/* Grab the next page in the segment */
do {
rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext);
if( rc==LSM_OK && pNext==0 ){
rc = LSM_CORRUPT_BKPT;
}
if( rc ) break;
lsmFsPageRelease(pPg);
pPg = pNext;
aData = fsPageData(pPg, &nData);
flags = lsmGetU16(&aData[SEGMENT_FLAGS_OFFSET(nData)]);
}while( flags&SEGMENT_BTREE_FLAG );
iEnd = SEGMENT_EOF(nData, lsmGetU16(&aData[nData-2]));
assert( iEnd>0 && iEnd<nData );
}
lsmFsPageRelease(pPg);
}
return rc;
}
/*
** Return the record count stored in the footer of the page held in
** buffer aData, which is nData bytes in size.
*/
static int pageGetNRec(u8 *aData, int nData){
  u8 *aField = &aData[SEGMENT_NRECORD_OFFSET(nData)];
  return (int)lsmGetU16(aField);
}
/*
** Return the 8-byte pointer value stored in the footer of the page held
** in buffer aData, which is nData bytes in size.
*/
static LsmPgno pageGetPtr(u8 *aData, int nData){
  u8 *aField = &aData[SEGMENT_POINTER_OFFSET(nData)];
  return (LsmPgno)lsmGetU64(aField);
}
/*
** Return the flags field stored in the footer of the page held in buffer
** aData, which is nData bytes in size.
*/
static int pageGetFlags(u8 *aData, int nData){
  u8 *aField = &aData[SEGMENT_FLAGS_OFFSET(nData)];
  return (int)lsmGetU16(aField);
}
/*
** Return a pointer to the first byte of cell iCell on the page held in
** buffer aData (nData bytes). The cell offset is read from the cell
** pointer array in the page footer.
*/
static u8 *pageGetCell(u8 *aData, int nData, int iCell){
  int iSlot = SEGMENT_CELLPTR_OFFSET(nData, iCell);
  int iOff = lsmGetU16(&aData[iSlot]);
  return &aData[iOff];
}
/*
** Return the number of records stored on page object pPg, as recorded
** in the page footer.
*/
static int pageObjGetNRec(Page *pPg){
  int nByte;
  u8 *aBuf;
  aBuf = lsmFsPageData(pPg, &nByte);
  return pageGetNRec(aBuf, nByte);
}
/*
** Decode and return the (possibly relative) pointer value of cell iCell
** on the page held in buffer aData/nData. The pointer is the varint that
** immediately follows the flags byte of the cell.
*/
static LsmPgno pageGetRecordPtr(u8 *aData, int nData, int iCell){
  LsmPgno iPtr;                   /* Decoded pointer value */
  u8 *aRec;                       /* First byte of cell iCell */

  assert( iCell>=0 && iCell<pageGetNRec(aData, nData) );
  aRec = pageGetCell(aData, nData, iCell);
  lsmVarintGet64(aRec+1, &iPtr);
  return iPtr;
}
/*
** Return a pointer to the key of cell iCell on page pPg, which must not be
** a b-tree page. The key size is written to *pnKey and the topic (the
** LSM_SYSTEMKEY bit of the record flags) to *piTopic.
**
** The returned pointer refers either directly to the page buffer or, if
** the key spans more than one page, to pBlob->pData (see sortedReadData()).
**
** NOTE(review): the return code of sortedReadData() is discarded. If it
** fails before setting its output, the pointer returned here still refers
** to the page buffer even though the key does not fit on the page.
*/
static u8 *pageGetKey(
Segment *pSeg, /* Segment pPg belongs to */
Page *pPg, /* Page to read from */
int iCell, /* Index of cell on page to read */
int *piTopic, /* OUT: Topic associated with this key */
int *pnKey, /* OUT: Size of key in bytes */
LsmBlob *pBlob /* If required, use this for dynamic memory */
){
u8 *pKey;
i64 nDummy;
int eType;
u8 *aData;
int nData;
aData = fsPageData(pPg, &nData);
assert( !(pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG) );
assert( iCell<pageGetNRec(aData, nData) );
pKey = pageGetCell(aData, nData, iCell);
/* Record layout: flags byte, pointer varint, key-size varint and, for
** LSM_INSERT records only, a value-size varint. Skip to the key data. */
eType = *pKey++;
pKey += lsmVarintGet64(pKey, &nDummy);
pKey += lsmVarintGet32(pKey, pnKey);
if( rtIsWrite(eType) ){
pKey += lsmVarintGet64(pKey, &nDummy);
}
*piTopic = rtTopic(eType);
sortedReadData(pSeg, pPg, pKey-aData, *pnKey, (void **)&pKey, pBlob);
return pKey;
}
/*
** Load the key of cell iCell on page pPg into pBlob, so that the key
** remains available independently of the page buffer. The topic of the
** key is written to *piTopic.
**
** If pageGetKey() already assembled the key in pBlob, nothing more is
** done. Otherwise the key is copied into pBlob with sortedBlobSet().
** Returns LSM_OK, or the error code returned by sortedBlobSet().
*/
static int pageGetKeyCopy(
  lsm_env *pEnv,                  /* Environment handle */
  Segment *pSeg,                  /* Segment pPg belongs to */
  Page *pPg,                      /* Page to read from */
  int iCell,                      /* Index of cell on page to read */
  int *piTopic,                   /* OUT: Topic associated with this key */
  LsmBlob *pBlob                  /* If required, use this for dynamic memory */
){
  int nKey;
  u8 *aKey = pageGetKey(pSeg, pPg, iCell, piTopic, &nKey, pBlob);

  assert( (void *)aKey!=pBlob->pData || nKey==pBlob->nData );
  if( (void *)aKey==pBlob->pData ){
    /* Key is already stored in the blob. */
    return LSM_OK;
  }
  return sortedBlobSet(pEnv, pBlob, aKey, nKey);
}
/*
** Cell iKey of b-tree page pPg must have a flags byte of zero. Such a cell
** consists of the flags byte followed by two varints. Return the value of
** the second varint.
*/
static LsmPgno pageGetBtreeRef(Page *pPg, int iKey){
  LsmPgno iRef;
  int nData;
  int nSkip;
  u8 *aData = fsPageData(pPg, &nData);
  u8 *aCell = pageGetCell(aData, nData, iKey);

  assert( aCell[0]==0 );

  /* Decode the first varint only to find out how many bytes it occupies;
  ** its value is overwritten by the second decode. */
  nSkip = lsmVarintGet64(&aCell[1], &iRef);
  lsmVarintGet64(&aCell[1+nSkip], &iRef);
  assert( iRef>0 );
  return iRef;
}
/*
** Fast-path varint decoders. Each macro first assigns the byte at (a) to
** (i). If that byte is 240 or less the macro evaluates to 1. Otherwise it
** evaluates to the result of the full lsmVarintGet64()/lsmVarintGet32()
** call, which overwrites (i). Callers in this file add the result to
** their read position.
**
** Both arguments are evaluated more than once, so they must be free of
** side effects.
*/
#define GETVARINT64(a, i) (((i)=((u8*)(a))[0])<=240?1:lsmVarintGet64((a), &(i)))
#define GETVARINT32(a, i) (((i)=((u8*)(a))[0])<=240?1:lsmVarintGet32((a), &(i)))
/*
** Read the key and pointer stored in cell iKey of b-tree page pPg.
**
** The pointer value of the cell is written to *piPtr. If the flags byte
** of the cell is zero, the cell does not contain the key itself. Instead
** it contains the number of a page in the segment, and the key is the
** first key on that page. In this case the key is copied into pBlob and
** (*ppKey, *pnKey) refer to the blob. Otherwise (*ppKey, *pnKey) refer
** directly to the page buffer of pPg.
**
** If piTopic is not NULL, *piTopic is set to the topic of the key.
**
** Returns LSM_OK on success. If loading the referenced page or copying
** its key fails, the error code is returned and *ppKey, *pnKey and
** *piTopic are left unmodified.
*/
static int pageGetBtreeKey(
  Segment *pSeg,                  /* Segment page pPg belongs to */
  Page *pPg,                      /* B-tree page to read from */
  int iKey,                       /* Index of cell on pPg */
  LsmPgno *piPtr,                 /* OUT: Pointer value of the cell */
  int *piTopic,                   /* OUT: Topic of the key (may be NULL) */
  void **ppKey,                   /* OUT: Pointer to key */
  int *pnKey,                     /* OUT: Size of key in bytes */
  LsmBlob *pBlob                  /* If required, use this for dynamic memory */
){
  u8 *aData;
  int nData;
  u8 *aCell;
  int eType;

  aData = fsPageData(pPg, &nData);
  assert( SEGMENT_BTREE_FLAG & pageGetFlags(aData, nData) );
  assert( iKey>=0 && iKey<pageGetNRec(aData, nData) );

  aCell = pageGetCell(aData, nData, iKey);
  eType = *aCell++;
  aCell += GETVARINT64(aCell, *piPtr);

  if( eType==0 ){
    int rc;
    LsmPgno iRef;                 /* Page number of referenced page */
    Page *pRef;
    aCell += GETVARINT64(aCell, iRef);
    rc = lsmFsDbPageGet(lsmPageFS(pPg), pSeg, iRef, &pRef);
    if( rc!=LSM_OK ) return rc;

    /* Copy the key out of the referenced page. The page reference is
    ** dropped whether or not the copy succeeds. A failure (e.g. OOM) must
    ** be reported to the caller, as pBlob does not contain a valid key. */
    rc = pageGetKeyCopy(lsmPageEnv(pPg), pSeg, pRef, 0, &eType, pBlob);
    lsmFsPageRelease(pRef);
    if( rc!=LSM_OK ) return rc;

    *ppKey = pBlob->pData;
    *pnKey = pBlob->nData;
  }else{
    aCell += GETVARINT32(aCell, *pnKey);
    *ppKey = aCell;
  }
  if( piTopic ) *piTopic = rtTopic(eType);

  return LSM_OK;
}
/*
** Populate the cached key fields (pKey, nKey, eType) of b-tree cursor
** pCsr based on its current position.
**
** If the cursor is at EOF (iPg<0) the cache is cleared. Otherwise, if the
** cell index at the current level is negative, search upwards for the
** closest ancestor level for which (iCell-1) is not negative and read the
** key from that cell. LSM_SEPARATOR is always OR'd into the cached eType.
**
** Returns LSM_OK, LSM_CORRUPT_BKPT if no suitable cell can be found, or
** the error code returned by pageGetBtreeKey().
*/
static int btreeCursorLoadKey(BtreeCursor *pCsr){
  LsmPgno iUnused;                /* Pointer value of cell (not used) */
  int iLevel;                     /* Index into pCsr->aPg[] */
  int iCell;                      /* Cell to read key from */
  int rc;

  if( pCsr->iPg<0 ){
    pCsr->pKey = 0;
    pCsr->nKey = 0;
    pCsr->eType = 0;
    return LSM_OK;
  }

  iLevel = pCsr->iPg;
  iCell = pCsr->aPg[iLevel].iCell;
  while( iCell<0 && (--iLevel)>=0 ){
    iCell = pCsr->aPg[iLevel].iCell-1;
  }
  if( iLevel<0 || iCell<0 ) return LSM_CORRUPT_BKPT;

  rc = pageGetBtreeKey(
      pCsr->pSeg,
      pCsr->aPg[iLevel].pPage, iCell,
      &iUnused, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob
  );
  pCsr->eType |= LSM_SEPARATOR;
  return rc;
}
/*
** Return the pointer associated with position iCell of the page held in
** buffer aData/nData. For a valid cell index this is the pointer stored in
** the cell. For an index at or past the number of cells on the page it is
** the pointer stored in the page footer.
*/
static LsmPgno btreeCursorPtr(u8 *aData, int nData, int iCell){
  if( iCell<pageGetNRec(aData, nData) ){
    return pageGetRecordPtr(aData, nData, iCell);
  }
  return pageGetPtr(aData, nData);
}
/*
** Advance b-tree cursor pCsr to the next key. The cursor must currently
** be positioned on the lowest level of the hierarchy (iPg==nDepth-1).
**
** The cell index of the current page is incremented. If that moves past
** the last cell of the page, the page is released and the cursor ascends
** until it finds a page with cells remaining (or hits EOF, iPg<0). The key
** is loaded from there and the cursor then descends again, following the
** first pointer at each level, to the lowest level where the cell index is
** set to -1.
**
** On success with the cursor not at EOF, pCsr->iPtr is set to the pointer
** for the position following the current cell.
**
** Returns LSM_OK or an error code from btreeCursorLoadKey() or
** lsmFsDbPageGet().
*/
static int btreeCursorNext(BtreeCursor *pCsr){
int rc = LSM_OK;
BtreePg *pPg = &pCsr->aPg[pCsr->iPg];
int nCell;
u8 *aData;
int nData;
assert( pCsr->iPg>=0 );
assert( pCsr->iPg==pCsr->nDepth-1 );
aData = fsPageData(pPg->pPage, &nData);
nCell = pageGetNRec(aData, nData);
assert( pPg->iCell<=nCell );
pPg->iCell++;
if( pPg->iCell==nCell ){
LsmPgno iLoad;
/* Up to parent. */
lsmFsPageRelease(pPg->pPage);
pPg->pPage = 0;
pCsr->iPg--;
/* Keep ascending while the parent page is also exhausted. */
while( pCsr->iPg>=0 ){
pPg = &pCsr->aPg[pCsr->iPg];
aData = fsPageData(pPg->pPage, &nData);
if( pPg->iCell<pageGetNRec(aData, nData) ) break;
lsmFsPageRelease(pPg->pPage);
pCsr->iPg--;
}
/* Read the key */
rc = btreeCursorLoadKey(pCsr);
/* Unless the cursor is at EOF, descend to cell -1 (yes, negative one) of
** the left-most descendant. */
if( pCsr->iPg>=0 ){
pCsr->aPg[pCsr->iPg].iCell++;
iLoad = btreeCursorPtr(aData, nData, pPg->iCell);
do {
Page *pLoad;
pCsr->iPg++;
rc = lsmFsDbPageGet(pCsr->pFS, pCsr->pSeg, iLoad, &pLoad);
pCsr->aPg[pCsr->iPg].pPage = pLoad;
pCsr->aPg[pCsr->iPg].iCell = 0;
if( rc==LSM_OK ){
if( pCsr->iPg==(pCsr->nDepth-1) ) break;
aData = fsPageData(pLoad, &nData);
iLoad = btreeCursorPtr(aData, nData, 0);
}
}while( rc==LSM_OK && pCsr->iPg<(pCsr->nDepth-1) );
pCsr->aPg[pCsr->iPg].iCell = -1;
}
}else{
rc = btreeCursorLoadKey(pCsr);
}
/* Cache the pointer for the position following the current cell. */
if( rc==LSM_OK && pCsr->iPg>=0 ){
aData = fsPageData(pCsr->aPg[pCsr->iPg].pPage, &nData);
pCsr->iPtr = btreeCursorPtr(aData, nData, pCsr->aPg[pCsr->iPg].iCell+1);
}
return rc;
}
/*
** Free b-tree cursor pCsr: release the page references held in aPg[0]
** through aPg[iPg], free the key blob, the aPg[] array and the cursor
** itself. Passing NULL is a harmless no-op.
*/
static void btreeCursorFree(BtreeCursor *pCsr){
  lsm_env *pEnv;
  int iLevel;

  if( pCsr==0 ) return;

  pEnv = lsmFsEnv(pCsr->pFS);
  for(iLevel=0; iLevel<=pCsr->iPg; iLevel++){
    lsmFsPageRelease(pCsr->aPg[iLevel].pPage);
  }
  sortedBlobFree(&pCsr->blob);
  lsmFree(pEnv, pCsr->aPg);
  lsmFree(pEnv, pCsr);
}
/*
** Move b-tree cursor pCsr to the first key in the b-tree.
**
** Starting from the segment root page, follow the pointer of cell 0 on
** each page for as long as the page loaded has SEGMENT_BTREE_FLAG set,
** pushing each b-tree page onto the aPg[] stack. The stack is grown eight
** entries at a time. The first page without the flag terminates the
** descent and is released.
**
** If at least one b-tree page was loaded, the cell index of the deepest
** page is set to -1 and btreeCursorNext() is called to load the first key.
**
** Returns LSM_OK or an error code (IO error, OOM etc.).
*/
static int btreeCursorFirst(BtreeCursor *pCsr){
int rc;
Page *pPg = 0;
FileSystem *pFS = pCsr->pFS;
LsmPgno iPg = pCsr->pSeg->iRoot;
do {
rc = lsmFsDbPageGet(pFS, pCsr->pSeg, iPg, &pPg);
assert( (rc==LSM_OK)==(pPg!=0) );
if( rc==LSM_OK ){
u8 *aData;
int nData;
int flags;
aData = fsPageData(pPg, &nData);
flags = pageGetFlags(aData, nData);
if( (flags & SEGMENT_BTREE_FLAG)==0 ) break;
/* Grow aPg[] by 8 zeroed entries whenever it is full */
if( (pCsr->nDepth % 8)==0 ){
int nNew = pCsr->nDepth + 8;
pCsr->aPg = (BtreePg *)lsmReallocOrFreeRc(
lsmFsEnv(pFS), pCsr->aPg, sizeof(BtreePg) * nNew, &rc
);
if( rc==LSM_OK ){
memset(&pCsr->aPg[pCsr->nDepth], 0, sizeof(BtreePg) * 8);
}
}
if( rc==LSM_OK ){
assert( pCsr->aPg[pCsr->nDepth].iCell==0 );
pCsr->aPg[pCsr->nDepth].pPage = pPg;
pCsr->nDepth++;
iPg = pageGetRecordPtr(aData, nData, 0);
}
}
}while( rc==LSM_OK );
/* pPg is either the non-b-tree page that ended the descent, a page that
** could not be pushed because growing aPg[] failed, or NULL. */
lsmFsPageRelease(pPg);
pCsr->iPg = pCsr->nDepth-1;
if( rc==LSM_OK && pCsr->nDepth ){
pCsr->aPg[pCsr->iPg].iCell = -1;
rc = btreeCursorNext(pCsr);
}
return rc;
}
/*
** Serialize the current position of b-tree cursor pCsr into *p, in the
** form decoded by btreeCursorRestore():
**
**   p->iPg     Page number of the current page, or 0 for EOF.
**   p->iCell   ((current cell index + 1) << 8) + depth of the cursor,
**              or 0 for EOF.
*/
static void btreeCursorPosition(BtreeCursor *pCsr, MergeInput *p){
  BtreePg *pCur;

  if( pCsr->iPg<0 ){
    p->iPg = 0;
    p->iCell = 0;
    return;
  }

  pCur = &pCsr->aPg[pCsr->iPg];
  p->iPg = lsmFsPageNumber(pCur->pPage);
  p->iCell = ((pCur->iCell + 1) << 8) + pCsr->nDepth;
}
/*
** Store the location (page number and cell index) of the key that cursor
** pCsr currently points to in *p.
**
** If the cell index at the current level is not negative, that cell is
** the location. Otherwise the key lives in an ancestor page: the closest
** ancestor with a cell index greater than zero is located and the cell
** before its current index is used.
*/
static void btreeCursorSplitkey(BtreeCursor *pCsr, MergeInput *p){
  BtreePg *pSrc = &pCsr->aPg[pCsr->iPg];
  int iCell = pSrc->iCell;

  if( iCell<0 ){
    int i = pCsr->iPg-1;
    while( i>=0 && pCsr->aPg[i].iCell<=0 ){
      i--;
    }
    assert( i>=0 );
    pSrc = &pCsr->aPg[i];
    iCell = pSrc->iCell-1;
  }

  p->iCell = iCell;
  p->iPg = lsmFsPageNumber(pSrc->pPage);
}
/*
** Compare two keys, each identified by a topic and a key blob. Keys are
** ordered by topic first. Only if the topics are equal is the comparison
** function xCmp invoked to compare the key blobs.
**
** Returns a negative, zero or positive value if the left-hand key is
** respectively less than, equal to or greater than the right-hand key.
*/
static int sortedKeyCompare(
  int (*xCmp)(void *, int, void *, int),
  int iLhsTopic, void *pLhsKey, int nLhsKey,
  int iRhsTopic, void *pRhsKey, int nRhsKey
){
  int iTopicDiff = iLhsTopic - iRhsTopic;
  if( iTopicDiff!=0 ){
    return iTopicDiff;
  }
  return xCmp(pLhsKey, nLhsKey, pRhsKey, nRhsKey);
}
/*
** Restore b-tree cursor pCsr to the position serialized in *p by
** btreeCursorPosition(). If p->iPg is zero this is a no-op and the cursor
** is left as it is.
**
** The function allocates the aPg[] array, loads the page the cursor was on
** and then, if the hierarchy is more than one level deep, searches from
** the root of the b-tree (using comparison function xCmp) to rebuild the
** stack of ancestor pages. Finally the current key and pointer are loaded.
**
** Returns LSM_OK on success or an lsm error code (IO error, OOM etc.)
** otherwise.
*/
static int btreeCursorRestore(
  BtreeCursor *pCsr,
  int (*xCmp)(void *, int, void *, int),
  MergeInput *p
){
  int rc = LSM_OK;

  if( p->iPg ){
    lsm_env *pEnv = lsmFsEnv(pCsr->pFS);
    int iCell;                    /* Current cell number on leaf page */
    LsmPgno iLeaf;                /* Page number of current leaf page */
    int nDepth;                   /* Depth of b-tree structure */
    Segment *pSeg = pCsr->pSeg;

    /* Decode the MergeInput structure */
    iLeaf = p->iPg;
    nDepth = (p->iCell & 0x00FF);
    iCell = (p->iCell >> 8) - 1;

    /* Allocate the BtreeCursor.aPg[] array */
    assert( pCsr->aPg==0 );
    pCsr->aPg = (BtreePg *)lsmMallocZeroRc(pEnv, sizeof(BtreePg) * nDepth, &rc);

    /* Populate the last entry of the aPg[] array */
    if( rc==LSM_OK ){
      Page **pp = &pCsr->aPg[nDepth-1].pPage;
      pCsr->iPg = nDepth-1;
      pCsr->nDepth = nDepth;
      pCsr->aPg[pCsr->iPg].iCell = iCell;
      rc = lsmFsDbPageGet(pCsr->pFS, pSeg, iLeaf, pp);
    }

    /* Populate any other aPg[] array entries */
    if( rc==LSM_OK && nDepth>1 ){
      LsmBlob blob = {0, 0, 0, 0};
      void *pSeek;
      int nSeek;
      int iTopicSeek;
      int iPg = 0;
      LsmPgno iLoad = pSeg->iRoot;
      Page *pPg = pCsr->aPg[nDepth-1].pPage;

      if( pageObjGetNRec(pPg)==0 ){
        /* This can happen when pPg is the right-most leaf in the b-tree.
        ** In this case, set the iTopicSeek/pSeek/nSeek key to a value
        ** greater than any real key.  */
        assert( iCell==-1 );
        iTopicSeek = 1000;
        pSeek = 0;
        nSeek = 0;
      }else{
        LsmPgno dummy;
        rc = pageGetBtreeKey(pSeg, pPg,
            0, &dummy, &iTopicSeek, &pSeek, &nSeek, &pCsr->blob
        );
      }

      /* Descend from the root, one level per iteration. The loop condition
      ** is tested before the first iteration so that a failure to load the
      ** seek key above is reported to the caller, instead of being
      ** overwritten and the search run with an uninitialized seek key. */
      while( rc==LSM_OK && iPg<(nDepth-1) ){
        Page *pPg2;
        rc = lsmFsDbPageGet(pCsr->pFS, pSeg, iLoad, &pPg2);
        assert( rc==LSM_OK || pPg2==0 );
        if( rc==LSM_OK ){
          u8 *aData;              /* Buffer containing page data */
          int nData;              /* Size of aData[] in bytes */
          int iMin;
          int iMax;
          int iCell2;

          aData = fsPageData(pPg2, &nData);
          assert( (pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG) );

          /* Binary search for the first cell with a key larger than the
          ** seek key. If there is no such cell, follow the footer pointer. */
          iLoad = pageGetPtr(aData, nData);
          iCell2 = pageGetNRec(aData, nData);
          iMax = iCell2-1;
          iMin = 0;

          while( iMax>=iMin ){
            int iTry = (iMin+iMax)/2;
            void *pKey; int nKey; /* Key for cell iTry */
            int iTopic;           /* Topic for key pKey/nKey */
            LsmPgno iPtr;         /* Pointer for cell iTry */
            int res;              /* (pSeek - pKey) */

            rc = pageGetBtreeKey(
                pSeg, pPg2, iTry, &iPtr, &iTopic, &pKey, &nKey, &blob
            );
            if( rc!=LSM_OK ) break;

            res = sortedKeyCompare(
                xCmp, iTopicSeek, pSeek, nSeek, iTopic, pKey, nKey
            );
            assert( res!=0 );

            if( res<0 ){
              iLoad = iPtr;
              iCell2 = iTry;
              iMax = iTry-1;
            }else{
              iMin = iTry+1;
            }
          }

          /* The page is stored in aPg[] even on error, so that the
          ** reference is released when the cursor is freed. */
          pCsr->aPg[iPg].pPage = pPg2;
          pCsr->aPg[iPg].iCell = iCell2;
          iPg++;
          assert( rc!=LSM_OK || iPg!=nDepth-1
               || lsmFsRedirectPage(pCsr->pFS, pSeg->pRedirect, iLoad)==iLeaf
          );
        }
      }
      sortedBlobFree(&blob);
    }

    /* Load the current key and pointer */
    if( rc==LSM_OK ){
      BtreePg *pBtreePg;
      u8 *aData;
      int nData;

      pBtreePg = &pCsr->aPg[pCsr->iPg];
      aData = fsPageData(pBtreePg->pPage, &nData);
      pCsr->iPtr = btreeCursorPtr(aData, nData, pBtreePg->iCell+1);
      if( pBtreePg->iCell<0 ){
        LsmPgno dummy;
        int i;
        for(i=pCsr->iPg-1; i>=0; i--){
          if( pCsr->aPg[i].iCell>0 ) break;
        }
        assert( i>=0 );
        rc = pageGetBtreeKey(pSeg,
            pCsr->aPg[i].pPage, pCsr->aPg[i].iCell-1,
            &dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob
        );
        pCsr->eType |= LSM_SEPARATOR;
      }else{
        rc = btreeCursorLoadKey(pCsr);
      }
    }
  }
  return rc;
}
/*
** Allocate a new b-tree cursor for iterating through the b-tree of
** segment pSeg, which must have a root page (pSeg->iRoot!=0). The new
** cursor is initially at EOF (iPg==-1).
**
** *ppCsr is set to the new cursor, or to NULL if the allocation fails.
** Returns the result code set by lsmMallocZeroRc().
*/
static int btreeCursorNew(
  lsm_db *pDb,
  Segment *pSeg,
  BtreeCursor **ppCsr
){
  int rc = LSM_OK;
  BtreeCursor *pNew;

  assert( pSeg->iRoot );
  pNew = lsmMallocZeroRc(pDb->pEnv, sizeof(BtreeCursor), &rc);
  if( pNew!=0 ){
    pNew->pSeg = pSeg;
    pNew->pFS = pDb->pFS;
    pNew->iPg = -1;
  }

  *ppCsr = pNew;
  return rc;
}
/*
** Make pNext the current page of segment pointer pPtr, releasing the
** reference to the previous current page. If pNext is not NULL, the
** cached copies of the footer fields (cell count, flags and pointer) are
** refreshed from it. If pNext is NULL they are left untouched.
*/
static void segmentPtrSetPage(SegmentPtr *pPtr, Page *pNext){
  lsmFsPageRelease(pPtr->pPg);
  pPtr->pPg = pNext;

  if( pNext!=0 ){
    int nByte;
    u8 *aBuf = fsPageData(pNext, &nByte);
    pPtr->nCell = pageGetNRec(aBuf, nByte);
    pPtr->flags = (u16)pageGetFlags(aBuf, nByte);
    pPtr->iPtr = pageGetPtr(aBuf, nByte);
  }
}
/*
** Load page iNew of the segment accessed by pPtr and make it the current
** page of pPtr. If the page cannot be loaded, the current page of pPtr is
** set to NULL and the error code is returned.
*/
static int segmentPtrLoadPage(
  FileSystem *pFS,
  SegmentPtr *pPtr,               /* Load page into this SegmentPtr object */
  LsmPgno iNew                    /* Page number of new page */
){
  Page *pLoaded = 0;              /* The new page */
  int rc = lsmFsDbPageGet(pFS, pPtr->pSeg, iNew, &pLoaded);

  assert( rc==LSM_OK || pLoaded==0 );
  segmentPtrSetPage(pPtr, pLoaded);
  return rc;
}
/*
** Read nByte bytes of record data starting at offset iOff of the current
** page of pPtr. This is a wrapper around sortedReadData(); see that
** function for the meaning of ppData and pBlob.
*/
static int segmentPtrReadData(
  SegmentPtr *pPtr,
  int iOff,
  int nByte,
  void **ppData,
  LsmBlob *pBlob
){
  Segment *pSeg = pPtr->pSeg;
  Page *pPage = pPtr->pPg;
  return sortedReadData(pSeg, pPage, iOff, nByte, ppData, pBlob);
}
/*
** Replace the current page of pPtr with the adjacent page of the segment:
** the following page if eDir is +1, or the preceding one if eDir is -1.
** The page returned by lsmFsDbPageNext() - which may be NULL - becomes the
** current page.
*/
static int segmentPtrNextPage(
  SegmentPtr *pPtr,               /* Load page into this SegmentPtr object */
  int eDir                        /* +1 for next(), -1 for prev() */
){
  Page *pAdjacent;                /* New page to load */
  int rc;                         /* Return code */

  assert( eDir==1 || eDir==-1 );
  assert( pPtr->pPg );
  assert( pPtr->pSeg || eDir>0 );

  rc = lsmFsDbPageNext(pPtr->pSeg, pPtr->pPg, eDir, &pAdjacent);
  assert( rc==LSM_OK || pAdjacent==0 );
  segmentPtrSetPage(pPtr, pAdjacent);

  return rc;
}
/*
** Load cell iNew of the current page of pPtr into the "current cell"
** fields of the SegmentPtr: iCell, eType, iPgPtr, pKey/nKey and pVal/nVal.
** If pPtr has no current page this is a no-op that returns LSM_OK.
**
** The key is read using blob1 and the value (LSM_INSERT records only)
** using blob2 as backing storage where required. If the record has no
** value, or if reading the key fails, pVal/nVal are set to 0.
**
** Returns LSM_OK or the error code from segmentPtrReadData().
*/
static int segmentPtrLoadCell(
SegmentPtr *pPtr, /* Load cell into this SegmentPtr object */
int iNew /* Cell number of new cell */
){
int rc = LSM_OK;
if( pPtr->pPg ){
u8 *aData; /* Pointer to page data buffer */
int iOff; /* Offset in aData[] to read from */
int nPgsz; /* Size of page (aData[]) in bytes */
assert( iNew<pPtr->nCell );
pPtr->iCell = iNew;
aData = fsPageData(pPtr->pPg, &nPgsz);
iOff = lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nPgsz, pPtr->iCell)]);
/* Decode the record header: flags byte, pointer varint, key size and,
** for LSM_INSERT records, value size. */
pPtr->eType = aData[iOff];
iOff++;
iOff += GETVARINT64(&aData[iOff], pPtr->iPgPtr);
iOff += GETVARINT32(&aData[iOff], pPtr->nKey);
if( rtIsWrite(pPtr->eType) ){
iOff += GETVARINT32(&aData[iOff], pPtr->nVal);
}
assert( pPtr->nKey>=0 );
rc = segmentPtrReadData(
pPtr, iOff, pPtr->nKey, &pPtr->pKey, &pPtr->blob1
);
/* The value, if any, immediately follows the key */
if( rc==LSM_OK && rtIsWrite(pPtr->eType) ){
rc = segmentPtrReadData(
pPtr, iOff+pPtr->nKey, pPtr->nVal, &pPtr->pVal, &pPtr->blob2
);
}else{
pPtr->nVal = 0;
pPtr->pVal = 0;
}
}
return rc;
}
/*
** Return the segment that contains the split-key of the merge underway
** on level pLevel.
**
** The merge inputs are searched for the first one located on the same
** page as the split-key. If the merge has one more input than the level
** has right-hand-side segments, and the input found is the last one (or
** none was found), the segment is the lhs of the next level. Otherwise
** it is the right-hand-side segment with the same index as the input.
*/
static Segment *sortedSplitkeySegment(Level *pLevel){
  Merge *pMerge = pLevel->pMerge;
  MergeInput *pSplit = &pMerge->splitkey;
  int iInput = 0;

  while( iInput<pMerge->nInput
      && pSplit->iPg!=pMerge->aInput[iInput].iPg
  ){
    iInput++;
  }

  if( pMerge->nInput==(pLevel->nRight+1) && iInput>=(pMerge->nInput-1) ){
    return &pLevel->pNext->lhs;
  }
  return &pLevel->aRhs[iInput];
}
/*
** Load the split-key of the merge underway on level pLevel and store it
** in pLevel->iSplitTopic, pLevel->pSplitKey and pLevel->nSplitKey.
**
** This function is a no-op if *pRc is not LSM_OK when it is called. If an
** error occurs, *pRc is set to an lsm error code. In that case, provided
** the page containing the split-key was loaded, the three Level fields are
** still written: iSplitTopic is zero unless the topic was read
** successfully, and pSplitKey/nSplitKey reflect whatever is held by the
** key buffer at that point.
**
** The buffer assigned to pLevel->pSplitKey is allocated from pDb->pEnv
** and is not freed here.
*/
static void sortedSplitkey(lsm_db *pDb, Level *pLevel, int *pRc){
  Segment *pSeg;
  Page *pPg = 0;
  lsm_env *pEnv = pDb->pEnv;      /* Environment handle */
  int rc = *pRc;
  Merge *pMerge = pLevel->pMerge;

  pSeg = sortedSplitkeySegment(pLevel);
  if( rc==LSM_OK ){
    rc = lsmFsDbPageGet(pDb->pFS, pSeg, pMerge->splitkey.iPg, &pPg);
  }
  if( rc==LSM_OK ){
    /* iTopic must be initialized: pageGetBtreeKey() does not write its
    ** topic output if it fails, and the value is stored in pLevel below
    ** regardless of rc. */
    int iTopic = 0;
    LsmBlob blob = {0, 0, 0, 0};
    u8 *aData;
    int nData;

    aData = lsmFsPageData(pPg, &nData);
    if( pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG ){
      void *pKey;
      int nKey;
      LsmPgno dummy;
      rc = pageGetBtreeKey(pSeg,
          pPg, pMerge->splitkey.iCell, &dummy, &iTopic, &pKey, &nKey, &blob
      );
      /* If the key is stored directly on the b-tree page, take a copy */
      if( rc==LSM_OK && blob.pData!=pKey ){
        rc = sortedBlobSet(pEnv, &blob, pKey, nKey);
      }
    }else{
      rc = pageGetKeyCopy(
          pEnv, pSeg, pPg, pMerge->splitkey.iCell, &iTopic, &blob
      );
    }

    pLevel->iSplitTopic = iTopic;
    pLevel->pSplitKey = blob.pData;
    pLevel->nSplitKey = blob.nData;
    lsmFsPageRelease(pPg);
  }

  *pRc = rc;
}
/*
** Return segment pointer pPtr to the "no current page" state: release the
** current page and clear all cached page and cell fields. Each of the two
** key/value buffers is also freed if nThreshold or more bytes are
** allocated for it. Smaller buffers are retained for reuse.
*/
static void segmentPtrReset(SegmentPtr *pPtr, int nThreshold){
  lsmFsPageRelease(pPtr->pPg);
  pPtr->pPg = 0;

  pPtr->nCell = 0;
  pPtr->iCell = 0;
  pPtr->eType = 0;
  pPtr->pKey = 0;
  pPtr->nKey = 0;
  pPtr->pVal = 0;
  pPtr->nVal = 0;

  if( nThreshold<=pPtr->blob1.nAlloc ){
    sortedBlobFree(&pPtr->blob1);
  }
  if( nThreshold<=pPtr->blob2.nAlloc ){
    sortedBlobFree(&pPtr->blob2);
  }
}
/*
** Return true (1) if separator keys found via pPtr should be skipped by
** cursor pCsr, or false (0) if they should be visited. Separators are
** only visited if the cursor has the CURSOR_READ_SEPARATORS flag set and
** pPtr is the last entry of the pCsr->aPtr[] array.
*/
static int segmentPtrIgnoreSeparators(MultiCursor *pCsr, SegmentPtr *pPtr){
  SegmentPtr *pLast;

  if( (pCsr->flags & CURSOR_READ_SEPARATORS)==0 ){
    return 1;
  }
  pLast = &pCsr->aPtr[pCsr->nPtr-1];
  return (pPtr!=pLast);
}
/*
** Advance segment pointer pPtr to the next (bReverse==0) or previous
** (bReverse!=0) record in its segment. The pointer must have a current
** page when this function is called.
**
** When the step leaves the current page, adjacent pages are loaded until
** one is found that contains at least one cell and is not a b-tree page,
** or the end of the segment is reached (pPtr->pPg==0).
**
** When iterating in reverse through a segment other than the lhs of its
** level, the new key is compared with the level's split-key. If the new
** key is smaller the pointer is reset (EOF). If the pointer is at EOF
** after the step and the record it was on before had the LSM_END_DELETE
** flag set, a synthetic LSM_START_DELETE|LSM_POINT_DELETE record with the
** split-key as its key is presented, backed by the first page of the
** segment.
**
** If pCsr is not NULL, the step is repeated for as long as the pointer
** lands on a separator key that the cursor should ignore.
**
** Returns LSM_OK or an lsm error code (IO error, OOM etc.).
*/
static int segmentPtrAdvance(
MultiCursor *pCsr,
SegmentPtr *pPtr,
int bReverse
){
int eDir = (bReverse ? -1 : 1);
Level *pLvl = pPtr->pLevel;
do {
int rc;
int iCell; /* Number of new cell in page */
int svFlags = 0; /* SegmentPtr.eType before advance */
iCell = pPtr->iCell + eDir;
assert( pPtr->pPg );
assert( iCell<=pPtr->nCell && iCell>=-1 );
/* svFlags is only non-zero for reverse iteration outside of the lhs */
if( bReverse && pPtr->pSeg!=&pPtr->pLevel->lhs ){
svFlags = pPtr->eType;
assert( svFlags );
}
if( iCell>=pPtr->nCell || iCell<0 ){
/* Move to the adjacent page, skipping empty and b-tree pages */
do {
rc = segmentPtrNextPage(pPtr, eDir);
}while( rc==LSM_OK
&& pPtr->pPg
&& (pPtr->nCell==0 || (pPtr->flags & SEGMENT_BTREE_FLAG) )
);
if( rc!=LSM_OK ) return rc;
iCell = bReverse ? (pPtr->nCell-1) : 0;
}
/* A no-op if the end of the segment was reached (pPtr->pPg==0) */
rc = segmentPtrLoadCell(pPtr, iCell);
if( rc!=LSM_OK ) return rc;
if( svFlags && pPtr->pPg ){
int res = sortedKeyCompare(pCsr->pDb->xCmp,
rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey,
pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
);
if( res<0 ) segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
}
if( pPtr->pPg==0 && (svFlags & LSM_END_DELETE) ){
Segment *pSeg = pPtr->pSeg;
rc = lsmFsDbPageGet(pCsr->pDb->pFS, pSeg, pSeg->iFirst, &pPtr->pPg);
if( rc!=LSM_OK ) return rc;
pPtr->eType = LSM_START_DELETE | LSM_POINT_DELETE;
pPtr->eType |= (pLvl->iSplitTopic ? LSM_SYSTEMKEY : 0);
pPtr->pKey = pLvl->pSplitKey;
pPtr->nKey = pLvl->nSplitKey;
}
}while( pCsr
&& pPtr->pPg
&& segmentPtrIgnoreSeparators(pCsr, pPtr)
&& rtIsSeparator(pPtr->eType)
);
return LSM_OK;
}
/*
** Load either the last page (bLast!=0) or the first page (bLast==0) of
** the segment accessed by pPtr and make it the current page of pPtr.
**
** This function is a no-op if *pRc is not LSM_OK when it is called. If
** loading the page fails, *pRc is set to the error code and whatever page
** pointer was returned (initially NULL) becomes the current page.
*/
static void segmentPtrEndPage(
  FileSystem *pFS,
  SegmentPtr *pPtr,
  int bLast,
  int *pRc
){
  Segment *pSeg;
  Page *pNew = 0;

  if( *pRc!=LSM_OK ) return;

  pSeg = pPtr->pSeg;
  if( bLast ){
    *pRc = lsmFsDbPageLast(pFS, pSeg, &pNew);
  }else{
    *pRc = lsmFsDbPageGet(pFS, pSeg, pSeg->iFirst, &pNew);
  }
  segmentPtrSetPage(pPtr, pNew);
}
/*
** Try to move the segment pointer passed as the second argument so that it
** points at either the first (bLast==0) or last (bLast==1) cell in the
** segment pPtr->pSeg.
**
** Pages that contain no cells, and b-tree pages, are skipped. When seeking
** to the last cell of a segment other than the lhs of its level, the
** pointer is reset (left at EOF) if the key found is smaller than the
** split-key of the level. If the cell found is a separator that cursor
** pCsr should ignore, the pointer is advanced past it with
** segmentPtrAdvance().
**
** Return LSM_OK if successful or an lsm error code if something goes
** wrong (IO error, OOM etc.).
*/
static int segmentPtrEnd(MultiCursor *pCsr, SegmentPtr *pPtr, int bLast){
Level *pLvl = pPtr->pLevel;
int rc = LSM_OK;
FileSystem *pFS = pCsr->pDb->pFS;
int bIgnore;
segmentPtrEndPage(pFS, pPtr, bLast, &rc);
/* Skip over empty and b-tree pages, moving away from the end in question */
while( rc==LSM_OK && pPtr->pPg
&& (pPtr->nCell==0 || (pPtr->flags & SEGMENT_BTREE_FLAG))
){
rc = segmentPtrNextPage(pPtr, (bLast ? -1 : 1));
}
if( rc==LSM_OK && pPtr->pPg ){
rc = segmentPtrLoadCell(pPtr, bLast ? (pPtr->nCell-1) : 0);
if( rc==LSM_OK && bLast && pPtr->pSeg!=&pLvl->lhs ){
int res = sortedKeyCompare(pCsr->pDb->xCmp,
rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey,
pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
);
if( res<0 ) segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
}
}
bIgnore = segmentPtrIgnoreSeparators(pCsr, pPtr);
if( rc==LSM_OK && pPtr->pPg && bIgnore && rtIsSeparator(pPtr->eType) ){
rc = segmentPtrAdvance(pCsr, pPtr, bLast);
}
#if 0
if( bLast && rc==LSM_OK && pPtr->pPg
&& pPtr->pSeg==&pLvl->lhs
&& pLvl->nRight && (pPtr->eType & LSM_START_DELETE)
){
pPtr->iCell++;
pPtr->eType = LSM_END_DELETE | (pLvl->iSplitTopic);
pPtr->pKey = pLvl->pSplitKey;
pPtr->nKey = pLvl->nSplitKey;
pPtr->pVal = 0;
pPtr->nVal = 0;
}
#endif
return rc;
}
/*
** Return, via *ppKey and *pnKey, the key that segment pointer pPtr
** currently has loaded. The pointer must have a page loaded.
*/
static void segmentPtrKey(SegmentPtr *pPtr, void **ppKey, int *pnKey){
  assert( pPtr->pPg );
  *pnKey = pPtr->nKey;
  *ppKey = pPtr->pKey;
}
#if 0 /* NOT USED */
/*
** Return a nul-terminated, printable rendering of the nKey byte key at
** pKey. Each byte for which isalnum() is true is copied as is; every
** other byte is replaced by '.'.
**
** The returned buffer is obtained from lsmMalloc() and is owned by the
** caller. NULL is returned if the allocation fails.
*/
static char *keyToString(lsm_env *pEnv, void *pKey, int nKey){
  int i;
  u8 *aKey = (u8 *)pKey;
  char *zRet = (char *)lsmMalloc(pEnv, nKey+1);

  /* Do not write through a failed allocation. */
  if( zRet==0 ) return 0;

  for(i=0; i<nKey; i++){
    zRet[i] = (char)(isalnum(aKey[i]) ? aKey[i] : '.');
  }
  zRet[nKey] = '\0';
  return zRet;
}
#endif
#if 0 /* NOT USED */
/*
** Check that the page that pPtr currently has loaded is the correct page
** to search for key (pKey/nKey). If it is, return 1. Otherwise, an assert
** fails and this function does not return.
**
** The check is made in both directions (eDir==-1, then eDir==+1). In each
** direction the segment is walked from the current page to the nearest
** page that has at least one cell and does not have SEGMENT_BTREE_FLAG
** set. The key is then compared with the last cell of that page (when
** looking backwards) or its first cell (when looking forwards).
**
** 1 is also returned, without completing the check, if lsmFsDbPageNext()
** reports an error.
**
** NOTE(review): the topic of the search key is hard-coded to 0 (see the
** TODO below), so this check looks unreliable for keys with another topic.
*/
static int assertKeyLocation(
MultiCursor *pCsr,
SegmentPtr *pPtr,
void *pKey, int nKey
){
lsm_env *pEnv = lsmFsEnv(pCsr->pDb->pFS);
LsmBlob blob = {0, 0, 0};
int eDir;
int iTopic = 0; /* TODO: Fix me */
for(eDir=-1; eDir<=1; eDir+=2){
/* Take an extra reference on the current page; the loop below releases
** one reference for every page it steps away from. */
Page *pTest = pPtr->pPg;
lsmFsPageRef(pTest);
while( pTest ){
Segment *pSeg = pPtr->pSeg;
Page *pNext;
int rc = lsmFsDbPageNext(pSeg, pTest, eDir, &pNext);
lsmFsPageRelease(pTest);
/* NOTE(review): this early return skips sortedBlobFree(&blob) below. */
if( rc ) return 1;
pTest = pNext;
if( pTest ){
int nData;
u8 *aData = fsPageData(pTest, &nData);
int nCell = pageGetNRec(aData, nData);
int flags = pageGetFlags(aData, nData);
/* Only a page holding at least one cell, and without the b-tree flag,
** is used for the comparison. Other pages are stepped over. */
if( nCell && 0==(flags&SEGMENT_BTREE_FLAG) ){
int nPgKey;
int iPgTopic;
u8 *pPgKey;
int res;
int iCell;
/* Compare against the cell nearest to the page being checked. */
iCell = ((eDir < 0) ? (nCell-1) : 0);
pPgKey = pageGetKey(pSeg, pTest, iCell, &iPgTopic, &nPgKey, &blob);
res = iTopic - iPgTopic;
if( res==0 ) res = pCsr->pDb->xCmp(pKey, nKey, pPgKey, nPgKey);
if( (eDir==1 && res>0) || (eDir==-1 && res<0) ){
/* Taking this branch means something has gone wrong. */
/* NOTE(review): neither zMsg nor the keyToString() result is freed. */
char *zMsg = lsmMallocPrintf(pEnv, "Key \"%s\" is not on page %d",
keyToString(pEnv, pKey, nKey), lsmFsPageNumber(pPtr->pPg)
);
fprintf(stderr, "%s\n", zMsg);
assert( !"assertKeyLocation() failed" );
}
/* Finished with this direction. */
lsmFsPageRelease(pTest);
pTest = 0;
}
}
}
}
sortedBlobFree(&blob);
return 1;
}
#endif
#ifndef NDEBUG
/*
** Debug-only helper intended for use inside assert(). Return true if the
** position of segment pointer pPtr is consistent with a seek for key
** (iTopic/pKey/nKey) using search bias eSeek:
**
**   LSM_SEEK_EQ:  the pointer key is equal to the search key,
**   LSM_SEEK_LE:  the pointer key is less than or equal to the search key,
**   LSM_SEEK_GE:  the pointer key is greater than or equal to the search key.
**
** A pointer with no page loaded, or any other value of eSeek, always
** passes.
*/
static int assertSeekResult(
  MultiCursor *pCsr,
  SegmentPtr *pPtr,
  int iTopic,
  void *pKey,
  int nKey,
  int eSeek
){
  int cmp;                        /* (search key) - (pointer key) */

  if( pPtr->pPg==0 ) return 1;

  cmp = sortedKeyCompare(pCsr->pDb->xCmp, iTopic, pKey, nKey,
      rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey
  );
  if( eSeek==LSM_SEEK_EQ ){
    return (cmp==0);
  }else if( eSeek==LSM_SEEK_LE ){
    return (cmp>=0);
  }else if( eSeek==LSM_SEEK_GE ){
    return (cmp<=0);
  }
  return 1;
}
#endif
/*
** While the page currently loaded by pPtr has PGFTR_SKIP_NEXT_FLAG set,
** compare the largest key on that page with the key being sought
** (iTopic/pKey/nKey). If the sought key is larger, move pPtr forward to
** the next page of the segment that has at least one record and does not
** have SEGMENT_BTREE_FLAG set, then repeat.
**
** The loop stops when the flag is clear, when the last key on the current
** page is greater than or equal to the sought key, or when no further
** page is available. In the last case pPtr is left on the page it was on.
**
** Returns LSM_OK, or the error code returned by lsmFsDbPageNext().
*/
static int segmentPtrSearchOversized(
  MultiCursor *pCsr,              /* Cursor context */
  SegmentPtr *pPtr,               /* Pointer to seek */
  int iTopic,                     /* Topic of key to search for */
  void *pKey, int nKey            /* Key to seek to */
){
  int (*xCompare)(void *, int, void *, int) = pCsr->pDb->xCmp;
  int rc = LSM_OK;

  while( rc==LSM_OK && (pPtr->flags & PGFTR_SKIP_NEXT_FLAG) ){
    u8 *pMaxKey;                  /* Largest key on the current page */
    int nMaxKey;                  /* Size of pMaxKey in bytes */
    int iMaxTopic;                /* Topic of pMaxKey */
    Page *pScan;                  /* Page being examined by the scan */
    int bUsable = 0;              /* True once pScan holds a usable page */

    /* If the last key on this page is >= the sought key, then the sought
    ** key, if present at all, must be on this page. */
    pMaxKey = pageGetKey(pPtr->pSeg,
        pPtr->pPg, pPtr->nCell-1, &iMaxTopic, &nMaxKey, &pPtr->blob1
    );
    if( sortedKeyCompare(
          xCompare, iMaxTopic, pMaxKey, nMaxKey, iTopic, pKey, nKey
        )>=0 ){
      break;
    }

    /* Scan forwards for the next page that contains at least one key.
    ** The extra reference taken here is dropped as the scan moves on. */
    pScan = pPtr->pPg;
    lsmFsPageRef(pScan);
    do{
      Page *pFollowing;
      rc = lsmFsDbPageNext(pPtr->pSeg, pScan, 1, &pFollowing);
      lsmFsPageRelease(pScan);
      pScan = pFollowing;
      if( pScan ){
        u8 *aData;
        int nData;
        assert( rc==LSM_OK );
        aData = lsmFsPageData(pScan, &nData);
        bUsable = (pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG)==0
               && pageGetNRec(aData, nData)>0;
      }
    }while( pScan && !bUsable );
    if( pScan==0 ) break;

    segmentPtrSetPage(pPtr, pScan);

    /* This should probably be an LSM_CORRUPT error. */
    assert( rc!=LSM_OK || (pPtr->flags & PGFTR_SKIP_THIS_FLAG) );
  }

  return rc;
}
/*
** Starting at cell iCell of page pPage, and continuing through following
** pages of segment pSeg from their first cell, search for a cell that does
** not have LSM_START_DELETE set. Pages with SEGMENT_BTREE_FLAG set are
** passed over without examining their cells.
**
** If such a cell is found, set *pbFound to 1 and *piPtr to the sum of the
** cell's record pointer and the page pointer, then return LSM_OK.
** Otherwise set *pbFound to 0 and return LSM_OK, or an error code if
** lsmFsDbPageNext() failed.
**
** This function releases one reference to pPage on every path, so the
** caller must hold a reference that it is willing to give up.
*/
static int ptrFwdPointer(
  Page *pPage,
  int iCell,
  Segment *pSeg,
  LsmPgno *piPtr,
  int *pbFound
){
  Page *pCur = pPage;             /* Page currently being examined */
  int iStart = iCell;             /* First cell of pCur to examine */
  int rc = LSM_OK;

  do{
    Page *pFollowing = 0;
    int nData;
    u8 *aData = lsmFsPageData(pCur, &nData);

    if( 0==(pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG) ){
      int nRec = pageGetNRec(aData, nData);
      int iRec = iStart;
      while( iRec<nRec ){
        u8 eType = *pageGetCell(aData, nData, iRec);
        if( 0==(eType & LSM_START_DELETE) ){
          *pbFound = 1;
          *piPtr = pageGetRecordPtr(aData, nData, iRec)
                 + pageGetPtr(aData, nData);
          lsmFsPageRelease(pCur);
          return LSM_OK;
        }
        iRec++;
      }
    }

    rc = lsmFsDbPageNext(pSeg, pCur, 1, &pFollowing);
    lsmFsPageRelease(pCur);
    pCur = pFollowing;
    iStart = 0;
  }while( pCur && rc==LSM_OK );

  lsmFsPageRelease(pCur);
  *pbFound = 0;
  return rc;
}
/*
** Move segment pointer pPtr to the first entry of its segment, then
** advance it forwards until it points at an entry whose key is greater
** than or equal to the split-key of level pLvl, or until it reaches EOF.
**
** Returns LSM_OK on success, or an lsm error code.
*/
static int sortedRhsFirst(MultiCursor *pCsr, Level *pLvl, SegmentPtr *pPtr){
  int rc = segmentPtrEnd(pCsr, pPtr, 0);

  for(;;){
    int cmp;                      /* (split key) - (pointer key) */
    if( pPtr->pPg==0 || rc!=LSM_OK ) break;
    cmp = sortedKeyCompare(pCsr->pDb->xCmp,
        pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey,
        rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey
    );
    if( cmp<=0 ) break;
    rc = segmentPtrAdvance(pCsr, pPtr, 0);
  }

  return rc;
}
/*
** This function is called as part of a SEEK_GE op on a multi-cursor if the
** FC pointer read from segment *pPtr comes from an entry with the
** LSM_START_DELETE flag set. In this case the pointer value cannot be
** trusted. Instead, the pointer that should be followed is that associated
** with the next entry in *pPtr that does not have LSM_START_DELETE set.
**
** (No explanation of why such pointers cannot be trusted is recorded
** here.)
**
** On success *piPtr is set to the pointer found, or to 0 if none was
** found. *piPtr is left untouched on the early "pointer will not be used"
** returns and on the LSM_CORRUPT_BKPT return.
**
** TODO: This is a stop-gap solution:
**
** At the moment, this function is called from within segmentPtrSeek(),
** as part of the initial lsmMCursorSeek() call. However, consider a
** database where the following has occurred:
**
** 1. Keys 1..9999 are removed using a range delete.
** 2. Keys 1 through 9999 are reinserted.
** 3. The levels containing the ops in 1. and 2. above are merged. Call
** this level N. Level N contains FC pointers to level N+1.
**
** Then, if the user attempts to query for (key>=2 LIMIT 10), the
** lsmMCursorSeek() call will iterate through 9998 entries searching for a
** pointer down to the level N+1 that is never actually used. It would be
** much better if the multi-cursor could do this lazily - only seek to the
** level (N+1) page after the user has moved the cursor on level N past
** the big range-delete.
*/
static int segmentPtrFwdPointer(
MultiCursor *pCsr, /* Multi-cursor pPtr belongs to */
SegmentPtr *pPtr, /* Segment-pointer to extract FC ptr from */
LsmPgno *piPtr /* OUT: FC pointer value */
){
Level *pLvl = pPtr->pLevel;
Level *pNext = pLvl->pNext;
Page *pPg = pPtr->pPg;
int rc;
int bFound;
LsmPgno iOut = 0;
/* If the segment that would be searched next has a b-tree root (or does
** not exist at all), return early without computing a pointer. */
if( pPtr->pSeg==&pLvl->lhs || pPtr->pSeg==&pLvl->aRhs[pLvl->nRight-1] ){
if( pNext==0
|| (pNext->nRight==0 && pNext->lhs.iRoot)
|| (pNext->nRight!=0 && pNext->aRhs[0].iRoot)
){
/* Do nothing. The pointer will not be used anyway. */
return LSM_OK;
}
}else{
if( pPtr[1].pSeg->iRoot ){
return LSM_OK;
}
}
/* Search for a pointer within the current segment. */
/* ptrFwdPointer() releases one reference to the page it is passed, so
** take an extra one here to keep pPtr's own reference intact. */
lsmFsPageRef(pPg);
rc = ptrFwdPointer(pPg, pPtr->iCell, pPtr->pSeg, &iOut, &bFound);
if( rc==LSM_OK && bFound==0 ){
/* This case happens when pPtr points to the left-hand-side of a segment
** currently undergoing an incremental merge. In this case, jump to the
** oldest segment in the right-hand-side of the same level and continue
** searching. But - do not consider any keys smaller than the levels
** split-key. */
SegmentPtr ptr;
if( pPtr->pLevel->nRight==0 || pPtr->pSeg!=&pPtr->pLevel->lhs ){
return LSM_CORRUPT_BKPT;
}
/* Use a temporary segment-pointer on the last entry of aRhs[]. */
memset(&ptr, 0, sizeof(SegmentPtr));
ptr.pLevel = pPtr->pLevel;
ptr.pSeg = &ptr.pLevel->aRhs[ptr.pLevel->nRight-1];
rc = sortedRhsFirst(pCsr, ptr.pLevel, &ptr);
if( rc==LSM_OK ){
/* NOTE(review): sortedRhsFirst() can return LSM_OK with ptr.pPg==0 (its
** loop exits when the pointer reaches EOF). ptr.pPg is passed on without
** a check - confirm ptrFwdPointer() cannot be handed a NULL page here. */
rc = ptrFwdPointer(ptr.pPg, ptr.iCell, ptr.pSeg, &iOut, &bFound);
/* The page reference was consumed by ptrFwdPointer(). Clear the field so
** that segmentPtrReset() below does not release it a second time. */
ptr.pPg = 0;
}
segmentPtrReset(&ptr, 0);
}
*piPtr = iOut;
return rc;
}
/*
** Seek segment-pointer pPtr, which must already have a page loaded, to
** key (iTopic/pKey/nKey) according to search bias eSeek.
**
** The steps are:
**
**   1. segmentPtrSearchOversized() is called so that, if the current page
**      has PGFTR_SKIP_NEXT_FLAG set, the pointer is first moved on to the
**      page that should contain the key.
**
**   2. A binary search is run over the cells of the page. Afterwards, res
**      is the result of comparing the cell the pointer has loaded with
**      the key being sought.
**
**   3. Depending on eSeek:
**
**      LSM_SEEK_EQ: *pbStop is set to 1 if the key was found with
**        LSM_INSERT set (in which case the key and value are also copied
**        into pCsr->key and pCsr->val and CURSOR_SEEK_EQ is set), or if a
**        point-delete or range-delete covering the key was found. The
**        segment pointer is always reset in this mode.
**
**      LSM_SEEK_LE: if the loaded cell is larger than the key, step the
**        pointer backwards by one entry.
**
**      LSM_SEEK_GE: if the loaded cell is smaller than the key, step the
**        pointer forwards by one entry. If the pointer value came from
**        within a range-delete, it is recomputed by
**        segmentPtrFwdPointer().
**
**   4. If the cursor ignores separator keys and the pointer has come to
**      rest on one, it is advanced in the direction implied by eSeek.
**
** *piPtr is always set before returning: to the pointer associated with
** the largest cell found that is less than or equal to the key, or to the
** page pointer (pPtr->iPtr) if there was no such cell. *pbStop is only
** ever written in LSM_SEEK_EQ mode.
**
** Returns LSM_OK on success, or an lsm error code. An error returned by
** step 1 is returned immediately. Previously it was overwritten by the
** result of the first segmentPtrLoadCell() call in step 2, so the search
** continued on a page that was not necessarily the right one and the
** error was lost.
*/
static int segmentPtrSeek(
  MultiCursor *pCsr,              /* Cursor context */
  SegmentPtr *pPtr,               /* Pointer to seek */
  int iTopic,                     /* Key topic to seek to */
  void *pKey, int nKey,           /* Key to seek to */
  int eSeek,                      /* Search bias - see above */
  LsmPgno *piPtr,                 /* OUT: FC pointer */
  int *pbStop                     /* OUT: Set to 1 to stop an EQ search */
){
  int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
  int res = 0;                    /* Result of comparison operation */
  int rc = LSM_OK;
  int iMin;
  int iMax;
  LsmPgno iPtrOut = 0;

  /* If the current page contains an oversized entry, then there are no
  ** pointers to one or more of the subsequent pages in the sorted run.
  ** The following call ensures that the segment-ptr points to the correct
  ** page in this case. */
  rc = segmentPtrSearchOversized(pCsr, pPtr, iTopic, pKey, nKey);
  iPtrOut = pPtr->iPtr;
  if( rc!=LSM_OK ){
    /* Do not search a page that may be the wrong one, and do not let the
    ** binary search below overwrite the error code. */
    *piPtr = iPtrOut;
    return rc;
  }

  /* Assert that this page is the right page of this segment for the key
  ** that we are searching for. Do this by loading page (iPg-1) and testing
  ** that pKey/nKey is greater than all keys on that page, and then by
  ** loading (iPg+1) and testing that pKey/nKey is smaller than all
  ** the keys it houses.
  **
  ** TODO: With range-deletes in the tree, the test described above may fail.
  */
#if 0
  assert( assertKeyLocation(pCsr, pPtr, pKey, nKey) );
#endif

  assert( pPtr->nCell>0
       || pPtr->pSeg->nSize==1
       || lsmFsDbPageIsLast(pPtr->pSeg, pPtr->pPg)
  );
  if( pPtr->nCell==0 ){
    segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
  }else{
    /* Binary search over cells iMin..iMax (inclusive) of the page. */
    iMin = 0;
    iMax = pPtr->nCell-1;

    while( 1 ){
      int iTry = (iMin+iMax)/2;
      void *pKeyT; int nKeyT;       /* Key for cell iTry */
      int iTopicT;

      assert( iTry<iMax || iMin==iMax );

      rc = segmentPtrLoadCell(pPtr, iTry);
      if( rc!=LSM_OK ) break;

      segmentPtrKey(pPtr, &pKeyT, &nKeyT);
      iTopicT = rtTopic(pPtr->eType);

      res = sortedKeyCompare(xCmp, iTopicT, pKeyT, nKeyT, iTopic, pKey, nKey);
      if( res<=0 ){
        /* Cell iTry is less than or equal to the key. Remember the pointer
        ** that goes with it. */
        iPtrOut = pPtr->iPtr + pPtr->iPgPtr;
      }

      if( res==0 || iMin==iMax ){
        break;
      }else if( res>0 ){
        iMax = LSM_MAX(iTry-1, iMin);
      }else{
        iMin = iTry+1;
      }
    }

    if( rc==LSM_OK ){
      assert( res==0 || (iMin==iMax && iMin>=0 && iMin<pPtr->nCell) );
      if( res ){
        rc = segmentPtrLoadCell(pPtr, iMin);
      }
      assert( rc!=LSM_OK || res>0 || iPtrOut==(pPtr->iPtr + pPtr->iPgPtr) );

      if( rc==LSM_OK ){
        switch( eSeek ){
          case LSM_SEEK_EQ: {
            int eType = pPtr->eType;
            if( (res<0 && (eType & LSM_START_DELETE))
             || (res>0 && (eType & LSM_END_DELETE))
             || (res==0 && (eType & LSM_POINT_DELETE))
            ){
              /* The key has been deleted. */
              *pbStop = 1;
            }else if( res==0 && (eType & LSM_INSERT) ){
              /* The key has been found. Copy it and its value into the
              ** multi-cursor. */
              lsm_env *pEnv = pCsr->pDb->pEnv;
              *pbStop = 1;
              pCsr->eType = pPtr->eType;
              rc = sortedBlobSet(pEnv, &pCsr->key, pPtr->pKey, pPtr->nKey);
              if( rc==LSM_OK ){
                rc = sortedBlobSet(pEnv, &pCsr->val, pPtr->pVal, pPtr->nVal);
              }
              pCsr->flags |= CURSOR_SEEK_EQ;
            }
            segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
            break;
          }
          case LSM_SEEK_LE:
            if( res>0 ) rc = segmentPtrAdvance(pCsr, pPtr, 1);
            break;
          case LSM_SEEK_GE: {
            /* Figure out if we need to 'skip' the pointer forward or not */
            if( (res<=0 && (pPtr->eType & LSM_START_DELETE))
             || (res>0  && (pPtr->eType & LSM_END_DELETE))
            ){
              rc = segmentPtrFwdPointer(pCsr, pPtr, &iPtrOut);
            }
            if( res<0 && rc==LSM_OK ){
              rc = segmentPtrAdvance(pCsr, pPtr, 0);
            }
            break;
          }
        }
      }
    }

    /* If the cursor seek has found a separator key, and this cursor is
    ** supposed to ignore separators keys, advance to the next entry. */
    if( rc==LSM_OK && pPtr->pPg
     && segmentPtrIgnoreSeparators(pCsr, pPtr)
     && rtIsSeparator(pPtr->eType)
    ){
      assert( eSeek!=LSM_SEEK_EQ );
      rc = segmentPtrAdvance(pCsr, pPtr, eSeek==LSM_SEEK_LE);
    }
  }

  assert( rc!=LSM_OK || assertSeekResult(pCsr,pPtr,iTopic,pKey,nKey,eSeek) );
  *piPtr = iPtrOut;
  return rc;
}
/*
** Descend the b-tree of segment pSeg, starting at page pSeg->iRoot, looking
** for key (iTopic/pKey/nKey). The descent ends at the first page loaded
** that does not have SEGMENT_BTREE_FLAG set.
**
** If ppPg is not NULL, *ppPg is set to a reference to that final page (or
** to NULL if an error occurs); the caller then owns the reference. If ppPg
** is NULL the page is released before returning.
**
** If aPg is not NULL, the page number of every page loaded during the
** descent is written to it in order. Extra entries, obtained from
** pageGetBtreeRef(), may be interleaved (see below). No bound on the
** number of entries written is enforced here, so the caller must supply
** an array that is large enough.
**
** Returns LSM_OK on success, or an lsm error code.
*/
static int seekInBtree(
MultiCursor *pCsr, /* Multi-cursor object */
Segment *pSeg, /* Seek within this segment */
int iTopic, /* Topic of key to seek to */
void *pKey, int nKey, /* Key to seek to */
LsmPgno *aPg, /* OUT: Page numbers */
Page **ppPg /* OUT: Leaf (sorted-run) page reference */
){
int i = 0;
int rc;
LsmPgno iPg;
Page *pPg = 0;
LsmBlob blob = {0, 0, 0};
iPg = pSeg->iRoot;
/* Each iteration of this loop loads one page. The loop is left by the
** "break" below when a non-b-tree page has been loaded (pPg is then left
** holding a reference), or via the loop condition when an error occurs. */
do {
LsmPgno *piFirst = 0;
if( aPg ){
aPg[i++] = iPg;
/* Slot reserved for at most one pageGetBtreeRef() value for this page. */
piFirst = &aPg[i];
}
rc = lsmFsDbPageGet(pCsr->pDb->pFS, pSeg, iPg, &pPg);
assert( rc==LSM_OK || pPg==0 );
if( rc==LSM_OK ){
u8 *aData; /* Buffer containing page data */
int nData; /* Size of aData[] in bytes */
int iMin;
int iMax;
int nRec;
int flags;
aData = fsPageData(pPg, &nData);
flags = pageGetFlags(aData, nData);
if( (flags & SEGMENT_BTREE_FLAG)==0 ) break;
/* Default child: the page pointer from the footer. This is the page that
** is followed if the search key is not smaller than any key on this
** page. */
iPg = pageGetPtr(aData, nData);
nRec = pageGetNRec(aData, nData);
/* Binary search over the cells of this page. Whenever the search key is
** smaller than the key of cell iTry, the pointer that goes with that cell
** becomes the candidate child, so the search finishes with the pointer of
** the smallest cell that is larger than the search key. */
iMin = 0;
iMax = nRec-1;
while( iMax>=iMin ){
int iTry = (iMin+iMax)/2;
void *pKeyT; int nKeyT; /* Key for cell iTry */
int iTopicT; /* Topic for key pKeyT/nKeyT */
LsmPgno iPtr; /* Pointer associated with cell iTry */
int res; /* (pKey - pKeyT) */
rc = pageGetBtreeKey(
pSeg, pPg, iTry, &iPtr, &iTopicT, &pKeyT, &nKeyT, &blob
);
if( rc!=LSM_OK ) break;
/* The first time a key on this page is returned in the scratch blob,
** record the pageGetBtreeRef() value for its cell in the reserved slot.
** NOTE(review): presumably such a key is stored on another page and the
** value is that page's number - confirm against pageGetBtreeKey(). */
if( piFirst && pKeyT==blob.pData ){
*piFirst = pageGetBtreeRef(pPg, iTry);
piFirst = 0;
i++;
}
res = sortedKeyCompare(
pCsr->pDb->xCmp, iTopic, pKey, nKey, iTopicT, pKeyT, nKeyT
);
if( res<0 ){
iPg = iPtr;
iMax = iTry-1;
}else{
iMin = iTry+1;
}
}
/* Finished with this b-tree page. */
lsmFsPageRelease(pPg);
pPg = 0;
}
}while( rc==LSM_OK );
sortedBlobFree(&blob);
assert( (rc==LSM_OK)==(pPg!=0) );
if( ppPg ){
*ppPg = pPg;
}else{
lsmFsPageRelease(pPg);
}
return rc;
}
/*
** Seek segment-pointer pPtr to key (iTopic/pKey/nKey) within its segment.
**
** The page to search is chosen as follows. If the segment has a b-tree
** (pSeg->iRoot!=0), the page is located with seekInBtree() and iPg is
** ignored. Otherwise page iPg is loaded, or the first page of the segment
** if iPg is zero. The page is then searched by segmentPtrSeek(), which
** also sets *piPtr and *pbStop.
**
** Returns LSM_OK on success, or an lsm error code.
*/
static int seekInSegment(
  MultiCursor *pCsr,
  SegmentPtr *pPtr,
  int iTopic,
  void *pKey, int nKey,
  LsmPgno iPg,                    /* Page to search */
  int eSeek,                      /* Search bias - see above */
  LsmPgno *piPtr,                 /* OUT: FC pointer */
  int *pbStop                     /* OUT: Stop search flag */
){
  Segment *pSegment = pPtr->pSeg;
  int rc;

  if( pSegment->iRoot==0 ){
    LsmPgno iLoad = (iPg==0) ? pSegment->iFirst : iPg;
    rc = segmentPtrLoadPage(pCsr->pDb->pFS, pPtr, iLoad);
  }else{
    Page *pLeaf;
    rc = seekInBtree(pCsr, pSegment, iTopic, pKey, nKey, 0, &pLeaf);
    if( rc==LSM_OK ) segmentPtrSetPage(pPtr, pLeaf);
  }
  if( rc!=LSM_OK ) return rc;

  return segmentPtrSeek(pCsr, pPtr, iTopic, pKey, nKey, eSeek, piPtr, pbStop);
}
/*
** Seek each segment pointer in the array of (pLvl->nRight+1) at aPtr[].
** aPtr[0] is the pointer for the left-hand-side segment of the level and
** aPtr[1..nRight] are the pointers for the right-hand-side segments.
**
** piPgno:
** On entry, *piPgno is the page number to begin the search at (zero
** means the first page of the segment). It is passed to the first
** segment searched, except that the left-hand-side of a composite level
** is always searched using 0. On exit, *piPgno is set to the pointer
** produced by the last segment searched.
**
** pbStop:
** This parameter is only significant if parameter eSeek is set to
** LSM_SEEK_EQ. In this case, it is set to true before returning if
** the seek operation is finished. This can happen in two ways:
**
** a) A key matching (pKey/nKey) is found, or
** b) A point-delete or range-delete deleting the key is found.
**
** In case (a), the multi-cursor CURSOR_SEEK_EQ flag is set and the pCsr->key
** and pCsr->val blobs populated before returning.
**
** Returns LSM_OK on success, or an lsm error code.
*/
static int seekInLevel(
MultiCursor *pCsr, /* Sorted cursor object to seek */
SegmentPtr *aPtr, /* Pointer to array of (nRhs+1) SPs */
int eSeek, /* Search bias - see above */
int iTopic, /* Key topic to search for */
void *pKey, int nKey, /* Key to search for */
LsmPgno *piPgno, /* IN/OUT: fraction cascade pointer (or 0) */
int *pbStop /* OUT: See above */
){
Level *pLvl = aPtr[0].pLevel; /* Level to seek within */
int rc = LSM_OK; /* Return code */
LsmPgno iOut = 0; /* Pointer to return to caller */
int res = -1; /* Result of xCmp(pKey, split) */
int nRhs = pLvl->nRight; /* Number of right-hand-side segments */
int bStop = 0;
/* If this is a composite level (one currently undergoing an incremental
** merge), figure out if the search key is larger or smaller than the
** levels split-key. */
if( nRhs ){
res = sortedKeyCompare(pCsr->pDb->xCmp, iTopic, pKey, nKey,
pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
);
}
/* If (res<0), then key pKey/nKey is smaller than the split-key (or this
** is not a composite level and there is no split-key). Search the
** left-hand-side of the level in this case. */
if( res<0 ){
int i;
LsmPgno iPtr = 0;
/* The caller's pointer is only used if this is not a composite level. */
if( nRhs==0 ) iPtr = *piPgno;
rc = seekInSegment(
pCsr, &aPtr[0], iTopic, pKey, nKey, iPtr, eSeek, &iOut, &bStop
);
/* A SEEK_GE that ran off the end of the lhs of a composite level must go
** on to search the rhs segments. Setting res to 0 enables the block
** below. */
if( rc==LSM_OK && nRhs>0 && eSeek==LSM_SEEK_GE && aPtr[0].pPg==0 ){
res = 0;
}
for(i=1; i<=nRhs; i++){
segmentPtrReset(&aPtr[i], LSM_SEGMENTPTR_FREE_THRESHOLD);
}
}
/* Search the right-hand-side segments in array order. Each segment is
** searched using the pointer produced by the one before it. */
if( res>=0 ){
int bHit = 0; /* True if at least one rhs is not EOF */
LsmPgno iPtr = *piPgno;
int i;
segmentPtrReset(&aPtr[0], LSM_SEGMENTPTR_FREE_THRESHOLD);
for(i=1; rc==LSM_OK && i<=nRhs && bStop==0; i++){
SegmentPtr *pPtr = &aPtr[i];
iOut = 0;
rc = seekInSegment(
pCsr, pPtr, iTopic, pKey, nKey, iPtr, eSeek, &iOut, &bStop
);
iPtr = iOut;
/* If the segment-pointer has settled on a key that is smaller than
** the splitkey, invalidate the segment-pointer. */
if( pPtr->pPg ){
res = sortedKeyCompare(pCsr->pDb->xCmp,
rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey,
pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
);
if( res<0 ){
/* Exception: if the entry opens a range-delete, keep the pointer but
** present the entry as a key-less-value boundary located at the
** split-key: LSM_INSERT is cleared, the key is replaced by the
** split-key and the value is dropped. */
if( pPtr->eType & LSM_START_DELETE ){
pPtr->eType &= ~LSM_INSERT;
pPtr->pKey = pLvl->pSplitKey;
pPtr->nKey = pLvl->nSplitKey;
pPtr->pVal = 0;
pPtr->nVal = 0;
}else{
segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
}
}
}
/* Record whether this rhs pointer still has a key loaded. */
if( aPtr[i].pKey ) bHit = 1;
}
/* For SEEK_LE, if no rhs pointer has a key loaded, fall back to the last
** entry of the lhs segment. */
if( rc==LSM_OK && eSeek==LSM_SEEK_LE && bHit==0 ){
rc = segmentPtrEnd(pCsr, &aPtr[0], 1);
}
}
assert( eSeek==LSM_SEEK_EQ || bStop==0 );
*piPgno = iOut;
*pbStop = bStop;
return rc;
}
/*
** Retrieve the current key of component iKey of multi-cursor pCsr.
** Component iKey is one of:
**
**   CURSOR_DATA_TREE0, CURSOR_DATA_TREE1:
**     One of the two in-memory tree cursors.
**
**   CURSOR_DATA_SYSTEM:
**     The synthetic freelist entries. Each entry of the worker snapshot's
**     free-list is visited twice, as tracked by pCsr->iFree. The 4-byte
**     key is built in the buffer at pCsr->pSystemVal and is the bitwise
**     complement of a block number.
**
**   CURSOR_DATA_SEGMENT+N:
**     Segment pointer pCsr->aPtr[N] or, if N==pCsr->nPtr, the b-tree
**     cursor pCsr->pBtCsr.
**
** Each of the output parameters may be NULL. If the component has no
** current key, the outputs are set to a zero type, a NULL key pointer and
** a zero key size.
*/
static void multiCursorGetKey(
  MultiCursor *pCsr,
  int iKey,
  int *peType,                    /* OUT: Key type (SORTED_WRITE etc.) */
  void **ppKey,                   /* OUT: Pointer to buffer containing key */
  int *pnKey                      /* OUT: Size of *ppKey in bytes */
){
  int eTypeOut = 0;
  void *pKeyOut = 0;
  int nKeyOut = 0;

  if( iKey==CURSOR_DATA_TREE0 || iKey==CURSOR_DATA_TREE1 ){
    TreeCursor *pTreeCsr = pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0];
    if( lsmTreeCursorValid(pTreeCsr) ){
      lsmTreeCursorKey(pTreeCsr, &eTypeOut, &pKeyOut, &nKeyOut);
    }
  }else if( iKey==CURSOR_DATA_SYSTEM ){
    Snapshot *pWorker = pCsr->pDb->pWorker;
    if( pWorker && (pCsr->flags & CURSOR_FLUSH_FREELIST) ){
      int nEntry = pWorker->freelist.nEntry;
      if( pCsr->iFree < (nEntry*2) ){
        FreelistEntry *aEntry = pWorker->freelist.aEntry;
        int iEntry = nEntry - 1 - (pCsr->iFree / 2);
        u32 iBlkKey = 0;

        if( (pCsr->iFree % 2) ){
          /* Second visit to this entry */
          eTypeOut = LSM_END_DELETE|LSM_SYSTEMKEY;
          iBlkKey = aEntry[iEntry].iBlk-1;
        }else if( aEntry[iEntry].iId>=0 ){
          eTypeOut = LSM_INSERT|LSM_SYSTEMKEY;
          iBlkKey = aEntry[iEntry].iBlk;

          /* If the in-memory entry immediately before this one was a
          ** DELETE, and the block number is one greater than the current
          ** block number, mark this entry as an "end-delete-range". */
          if( iEntry<(nEntry-1)
           && aEntry[iEntry+1].iBlk==iBlkKey+1
           && aEntry[iEntry+1].iId<0
          ){
            eTypeOut |= LSM_END_DELETE;
          }
        }else{
          eTypeOut = LSM_START_DELETE|LSM_SYSTEMKEY;
          iBlkKey = aEntry[iEntry].iBlk + 1;
        }

        /* If the in-memory entry immediately after this one is a
        ** DELETE, and the block number is one less than the current
        ** key, mark this entry as an "start-delete-range". */
        if( iEntry>0
         && aEntry[iEntry-1].iBlk==iBlkKey-1
         && aEntry[iEntry-1].iId<0
        ){
          eTypeOut |= LSM_START_DELETE;
        }

        pKeyOut = pCsr->pSystemVal;
        nKeyOut = 4;
        lsmPutU32(pKeyOut, ~iBlkKey);
      }
    }
  }else{
    int iSeg = iKey - CURSOR_DATA_SEGMENT;
    assert( iSeg>=0 );
    if( iSeg==pCsr->nPtr ){
      BtreeCursor *pBt = pCsr->pBtCsr;
      if( pBt ){
        pKeyOut = pBt->pKey;
        nKeyOut = pBt->nKey;
        eTypeOut = pBt->eType;
      }
    }else if( iSeg<pCsr->nPtr ){
      SegmentPtr *pSegPtr = &pCsr->aPtr[iSeg];
      if( pSegPtr->pPg ){
        pKeyOut = pSegPtr->pKey;
        nKeyOut = pSegPtr->nKey;
        eTypeOut = pSegPtr->eType;
      }
    }
  }

  if( peType ) *peType = eTypeOut;
  if( pnKey ) *pnKey = nKeyOut;
  if( ppKey ) *ppKey = pKeyOut;
}
/*
** Compare two database keys, each supplied as a (flags, key, size)
** triple. The topics extracted from the flags take part in the
** comparison. The result is negative, zero or positive if the left-hand
** key is less than, equal to or greater than the right-hand key.
**
** If the keys compare equal and the cursor has CURSOR_IGNORE_DELETE set,
** ties are broken using the range-delete flags. Considering only the
** LSM_POINT_DELETE, LSM_INSERT, LSM_END_DELETE and LSM_START_DELETE bits,
** a key with LSM_START_DELETE alone is treated as slightly larger than an
** otherwise equal key, and a key with LSM_END_DELETE alone as slightly
** smaller. This prevents the beginning of an open-ended set from masking
** a database entry or delete at a lower level.
*/
static int sortedDbKeyCompare(
  MultiCursor *pCsr,
  int iLhsFlags, void *pLhsKey, int nLhsKey,
  int iRhsFlags, void *pRhsKey, int nRhsKey
){
  const int mDelete =
      LSM_POINT_DELETE|LSM_INSERT|LSM_END_DELETE |LSM_START_DELETE;
  int cmp;

  /* Compare the keys, including the system flag. */
  cmp = sortedKeyCompare(pCsr->pDb->xCmp,
      rtTopic(iLhsFlags), pLhsKey, nLhsKey,
      rtTopic(iRhsFlags), pRhsKey, nRhsKey
  );

  if( cmp==0 && (pCsr->flags & CURSOR_IGNORE_DELETE) ){
    const int eLhs = (iLhsFlags & mDelete);
    const int eRhs = (iRhsFlags & mDelete);
    int iLhsAdj = 0;
    int iRhsAdj = 0;

    if( eLhs==LSM_START_DELETE ){
      iLhsAdj = +1;
    }else if( eLhs==LSM_END_DELETE ){
      iLhsAdj = -1;
    }
    if( eRhs==LSM_START_DELETE ){
      iRhsAdj = +1;
    }else if( eRhs==LSM_END_DELETE ){
      iRhsAdj = -1;
    }
    cmp = (iLhsAdj - iRhsAdj);
  }

  return cmp;
}
/*
** Recalculate entry iOut of the pCsr->aTree[] array.
**
** Entries in the upper half of the array (iOut>=nTree/2) compare two
** cursor components directly: components (iOut-nTree/2)*2 and the one
** after it. Entries in the lower half compare the two components named by
** entries iOut*2 and iOut*2+1 of the array.
**
** The component with the smaller key wins (the larger key if bReverse is
** true) and its index is stored in aTree[iOut]. A component with no
** current key always loses. If the two keys are equal, the first
** component wins, unless it carries neither LSM_INSERT nor
** LSM_POINT_DELETE while the second component carries at least one of
** them.
*/
static void multiCursorDoCompare(MultiCursor *pCsr, int iOut, int bReverse){
  const int iSign = (bReverse ? -1 : 1);
  int iFirst;                     /* Index of first component to compare */
  int iSecond;                    /* Index of second component to compare */
  int iWinner;
  void *pKeyA; int nKeyA; int eTypeA;
  void *pKeyB; int nKeyB; int eTypeB;

  assert( pCsr->aTree && iOut<pCsr->nTree );
  if( iOut<(pCsr->nTree/2) ){
    iFirst = pCsr->aTree[iOut*2];
    iSecond = pCsr->aTree[iOut*2+1];
  }else{
    iFirst = (iOut - pCsr->nTree/2) * 2;
    iSecond = iFirst + 1;
  }

  multiCursorGetKey(pCsr, iFirst, &eTypeA, &pKeyA, &nKeyA);
  multiCursorGetKey(pCsr, iSecond, &eTypeB, &pKeyB, &nKeyB);

  if( pKeyA==0 ){
    iWinner = iSecond;
  }else if( pKeyB==0 ){
    iWinner = iFirst;
  }else{
    int cmp = sortedDbKeyCompare(pCsr,
        eTypeA, pKeyA, nKeyA, eTypeB, pKeyB, nKeyB
    );
    cmp = cmp * iSign;

    if( cmp<0 ){
      iWinner = iFirst;
    }else if( cmp>0 ){
      iWinner = iSecond;
    }else{
      /* The two keys are identical. Normally, this means that the key from
      ** the newer run clobbers the old. However, if the newer key is a
      ** separator key, or a range-delete-boundary only, do not allow it
      ** to clobber an older entry. */
      int bWeakA = (eTypeA & (LSM_INSERT|LSM_POINT_DELETE))==0;
      int bWeakB = (eTypeB & (LSM_INSERT|LSM_POINT_DELETE))==0;
      iWinner = (bWeakA > bWeakB) ? iSecond : iFirst;
    }
  }

  pCsr->aTree[iOut] = iWinner;
}
/*
** This function advances segment pointer iPtr belonging to multi-cursor
** pCsr forward (bReverse==0) or backward (bReverse!=0).
**
** If the segment pointer points to a segment that is part of a composite
** level, and the cursor has pointers for the whole level, then the
** following special cases are handled when the pointer reaches EOF:
**
** * If iPtr is the lhs of a composite level, and the cursor is being
** advanced forwards, and segment iPtr is at EOF, move all pointers
** that correspond to rhs segments of the same level to the first
** key in their respective data that is not smaller than the split-key.
**
** * If iPtr is an rhs segment of a composite level, and the cursor is
** being advanced backwards, and every rhs pointer of the level is now
** at EOF, move the lhs pointer of the level to its last key.
**
** In both cases all entries of pCsr->aTree[] are then recalculated.
**
** Returns LSM_OK on success, or an lsm error code.
*/
static int segmentCursorAdvance(
MultiCursor *pCsr,
int iPtr,
int bReverse
){
int rc;
SegmentPtr *pPtr = &pCsr->aPtr[iPtr];
Level *pLvl = pPtr->pLevel;
int bComposite; /* True if pPtr is part of composite level */
/* Advance the segment-pointer object. */
rc = segmentPtrAdvance(pCsr, pPtr, bReverse);
if( rc!=LSM_OK ) return rc;
/* The second condition excludes cursors that hold fewer than nRight+1
** pointers, such as those populated by multiCursorAddRhs(). */
bComposite = (pLvl->nRight>0 && pCsr->nPtr>pLvl->nRight);
if( bComposite && pPtr->pPg==0 ){
int bFix = 0;
/* True for (forwards and lhs) or for (backwards and rhs). */
if( (bReverse==0)==(pPtr->pSeg==&pLvl->lhs) ){
int i;
if( bReverse ){
/* pPtr is an rhs pointer. (pPtr->pSeg - pLvl->aRhs) is its index within
** aRhs[], so pLhs is the lhs pointer of the same level. */
SegmentPtr *pLhs = &pCsr->aPtr[iPtr - 1 - (pPtr->pSeg - pLvl->aRhs)];
for(i=0; i<pLvl->nRight; i++){
if( pLhs[i+1].pPg ) break;
}
/* Every rhs pointer is at EOF. Continue with the lhs. */
if( i==pLvl->nRight ){
bFix = 1;
rc = segmentPtrEnd(pCsr, pLhs, 1);
}
}else{
bFix = 1;
for(i=0; rc==LSM_OK && i<pLvl->nRight; i++){
rc = sortedRhsFirst(pCsr, pLvl, &pCsr->aPtr[iPtr+1+i]);
}
}
}
/* Pointers other than iPtr have moved, so recalculate every entry of
** aTree[], from the last entry back to entry 1. */
if( bFix ){
int i;
for(i=pCsr->nTree-1; i>0; i--){
multiCursorDoCompare(pCsr, i, bReverse);
}
}
}
#if 0
if( bComposite && pPtr->pSeg==&pLvl->lhs /* lhs of composite level */
&& bReverse==0 /* csr advanced forwards */
&& pPtr->pPg==0 /* segment at EOF */
){
int i;
for(i=0; rc==LSM_OK && i<pLvl->nRight; i++){
rc = sortedRhsFirst(pCsr, pLvl, &pCsr->aPtr[iPtr+1+i]);
}
for(i=pCsr->nTree-1; i>0; i--){
multiCursorDoCompare(pCsr, i, 0);
}
}
#endif
return rc;
}
/*
** Release everything owned by multi-cursor pCsr other than the cursor
** structure itself and its cached key and value blobs: both tree cursors,
** every segment pointer, the b-tree cursor, and the aPtr[], aTree[] and
** pSystemVal allocations. All of the corresponding fields are zeroed.
*/
static void mcursorFreeComponents(MultiCursor *pCsr){
  lsm_env *pEnv = pCsr->pDb->pEnv;
  int iPtr = 0;

  /* Close the tree cursors, if any. */
  lsmTreeCursorDestroy(pCsr->apTreeCsr[0]);
  lsmTreeCursorDestroy(pCsr->apTreeCsr[1]);

  /* Reset the segment pointers. */
  while( iPtr<pCsr->nPtr ){
    segmentPtrReset(&pCsr->aPtr[iPtr], 0);
    iPtr++;
  }

  /* And the b-tree cursor, if any. */
  btreeCursorFree(pCsr->pBtCsr);

  /* Free each allocation. */
  lsmFree(pEnv, pCsr->aPtr);
  lsmFree(pEnv, pCsr->aTree);
  lsmFree(pEnv, pCsr->pSystemVal);

  /* Leave no dangling pointers or stale counts behind. */
  pCsr->aPtr = 0;
  pCsr->nPtr = 0;
  pCsr->aTree = 0;
  pCsr->nTree = 0;
  pCsr->pSystemVal = 0;
  pCsr->pBtCsr = 0;
  pCsr->apTreeCsr[0] = 0;
  pCsr->apTreeCsr[1] = 0;
}
/*
** Close, without caching, every cursor on the pDb->pCsrCache list and
** leave the list empty.
*/
void lsmMCursorFreeCache(lsm_db *pDb){
  MultiCursor *pCsr = pDb->pCsrCache;
  while( pCsr ){
    /* Read the link before the cursor is freed. */
    MultiCursor *pFollowing = pCsr->pNext;
    lsmMCursorClose(pCsr, 0);
    pCsr = pFollowing;
  }
  pDb->pCsrCache = 0;
}
/*
** Close the cursor passed as the first argument. Passing NULL is a no-op.
**
** The cursor is first removed from the lsm_db.pCsr list, if it is on it.
** Then, if bCache is true, the page references held by its segment
** pointers are released, its tree cursors are reset, and it is moved to
** the pCsrCache list for possible reuse. Otherwise the cursor and
** everything it owns are freed.
*/
void lsmMCursorClose(MultiCursor *pCsr, int bCache){
  lsm_db *pDb;
  MultiCursor **ppLink;           /* Used to find pCsr in the open list */
  int i;                          /* Used to iterate through segment-ptrs */

  if( pCsr==0 ) return;
  pDb = pCsr->pDb;

  /* The cursor may or may not be currently part of the linked list
  ** starting at lsm_db.pCsr. If it is, extract it. */
  ppLink = &pDb->pCsr;
  while( *ppLink && *ppLink!=pCsr ){
    ppLink = &((*ppLink)->pNext);
  }
  if( *ppLink ){
    *ppLink = pCsr->pNext;
  }

  if( !bCache ){
    /* Free the allocations used to cache the current key and value, then
    ** the component cursors, then the cursor structure itself. */
    sortedBlobFree(&pCsr->key);
    sortedBlobFree(&pCsr->val);
    mcursorFreeComponents(pCsr);
    lsmFree(pDb->pEnv, pCsr);
    return;
  }

  /* Release any page references held by this cursor. */
  assert( !pCsr->pBtCsr );
  for(i=0; i<pCsr->nPtr; i++){
    SegmentPtr *pSegPtr = &pCsr->aPtr[i];
    lsmFsPageRelease(pSegPtr->pPg);
    pSegPtr->pPg = 0;
  }

  /* Reset the tree cursors. */
  lsmTreeCursorReset(pCsr->apTreeCsr[0]);
  lsmTreeCursorReset(pCsr->apTreeCsr[1]);

  /* Push the cursor onto the front of the pCsrCache list. */
  pCsr->pNext = pDb->pCsrCache;
  pDb->pCsrCache = pCsr;
}
#define TREE_NONE 0
#define TREE_OLD 1
#define TREE_BOTH 2

/*
** Add in-memory tree cursors to multi-cursor pCsr. Parameter eTree is one
** of TREE_NONE, TREE_OLD or TREE_BOTH:
**
**   * Unless eTree is TREE_NONE, a cursor on the 'old' tree is opened if
**     there is an old tree and db->treehdr.iOldLog differs from
**     pSnap->iLogOff.
**
**   * If eTree is TREE_BOTH, a cursor on the 'current' tree is opened as
**     well.
**
** Returns LSM_OK on success, or an lsm error code.
*/
static int multiCursorAddTree(MultiCursor *pCsr, Snapshot *pSnap, int eTree){
  lsm_db *db = pCsr->pDb;
  int rc = LSM_OK;
  int bOpenOld;

  bOpenOld = eTree!=TREE_NONE
          && lsmTreeHasOld(db)
          && db->treehdr.iOldLog!=pSnap->iLogOff;
  if( bOpenOld ){
    rc = lsmTreeCursorNew(db, 1, &pCsr->apTreeCsr[1]);
  }

  if( rc==LSM_OK && eTree==TREE_BOTH ){
    rc = lsmTreeCursorNew(db, 0, &pCsr->apTreeCsr[0]);
  }

  return rc;
}
/*
** Populate multi-cursor pCsr, which must not yet have a segment-pointer
** array, with one segment pointer for each right-hand-side segment of
** level pLvl. No pointer is added for the left-hand-side segment. The
** level must have at least one right-hand-side segment.
**
** Returns LSM_OK on success, or LSM_NOMEM_BKPT if the array cannot be
** allocated.
*/
static int multiCursorAddRhs(MultiCursor *pCsr, Level *pLvl){
  const int nSeg = pLvl->nRight;
  SegmentPtr *aNew;
  int iSeg;

  assert( pLvl->nRight>0 );
  assert( pCsr->aPtr==0 );

  aNew = lsmMallocZero(pCsr->pDb->pEnv, sizeof(SegmentPtr) * nSeg);
  pCsr->aPtr = aNew;
  if( aNew==0 ) return LSM_NOMEM_BKPT;
  pCsr->nPtr = nSeg;

  for(iSeg=0; iSeg<nSeg; iSeg++){
    aNew[iSeg].pSeg = &pLvl->aRhs[iSeg];
    aNew[iSeg].pLevel = pLvl;
  }

  return LSM_OK;
}
/*
** Append segment pointers for level pLvl to the pCsr->aPtr[] array: first
** one for the left-hand-side segment, then one for each right-hand-side
** segment in array order. The array must already be large enough;
** pCsr->nPtr is increased by the number of pointers added.
**
** If the level has right-hand-side segments but no split-key loaded,
** sortedSplitkey() is called to load it. Any error it reports is left in
** *pRc.
**
** This is a no-op if *pRc is not LSM_OK when the function is called.
*/
static void multiCursorAddOne(MultiCursor *pCsr, Level *pLvl, int *pRc){
  SegmentPtr *pSlot;              /* Next unused entry in aPtr[] */
  int iRhs;

  if( *pRc!=LSM_OK ) return;

  pSlot = &pCsr->aPtr[pCsr->nPtr];
  pSlot->pLevel = pLvl;
  pSlot->pSeg = &pLvl->lhs;
  pSlot++;

  for(iRhs=0; iRhs<pLvl->nRight; iRhs++, pSlot++){
    pSlot->pLevel = pLvl;
    pSlot->pSeg = &pLvl->aRhs[iRhs];
  }

  if( pLvl->nRight && pLvl->pSplitKey==0 ){
    sortedSplitkey(pCsr->pDb, pLvl, pRc);
  }

  pCsr->nPtr = (int)(pSlot - pCsr->aPtr);
}
/*
** Allocate the pCsr->aPtr[] array and populate it with segment pointers
** for every segment of every level of snapshot pSnap, other than levels
** that have the LEVEL_INCOMPLETE flag set. The cursor must not already
** have a segment-pointer array.
**
** Returns LSM_OK on success, or an lsm error code.
*/
static int multiCursorAddAll(MultiCursor *pCsr, Snapshot *pSnap){
  Level *pLevel;
  int nSlot = 0;                  /* Number of segment pointers required */
  int rc = LSM_OK;

  /* Count the segments. If the LEVEL_INCOMPLETE flag is set, then this
  ** function is being called (indirectly) from within a
  ** sortedNewToplevel() call to construct the level. In this case ignore
  ** the level - this cursor is going to be used to retrieve a freelist
  ** entry from the LSM, and the partially complete level may confuse it. */
  for(pLevel=pSnap->pLevel; pLevel; pLevel=pLevel->pNext){
    if( (pLevel->flags & LEVEL_INCOMPLETE)==0 ){
      nSlot += (1 + pLevel->nRight);
    }
  }

  assert( pCsr->aPtr==0 );
  pCsr->aPtr = lsmMallocZeroRc(pCsr->pDb->pEnv, sizeof(SegmentPtr) * nSlot, &rc);

  /* multiCursorAddOne() does nothing once rc holds an error. */
  for(pLevel=pSnap->pLevel; pLevel; pLevel=pLevel->pNext){
    if( pLevel->flags & LEVEL_INCOMPLETE ) continue;
    multiCursorAddOne(pCsr, pLevel, &rc);
  }

  return rc;
}
/*
** Configure the newly allocated multi-cursor pCsr to read snapshot pSnap:
** add segment pointers for the levels of the snapshot, then tree cursors
** for both in-memory trees (TREE_BOTH). The CURSOR_IGNORE_SYSTEM and
** CURSOR_IGNORE_DELETE flags are set whether or not an error occurs.
**
** Returns LSM_OK on success, or an lsm error code.
*/
static int multiCursorInit(MultiCursor *pCsr, Snapshot *pSnap){
  int rc = multiCursorAddAll(pCsr, pSnap);

  if( rc==LSM_OK ){
    rc = multiCursorAddTree(pCsr, pSnap, TREE_BOTH);
  }

  pCsr->flags |= (CURSOR_IGNORE_SYSTEM | CURSOR_IGNORE_DELETE);
  return rc;
}
/*
** Allocate a zeroed MultiCursor for database handle db and add it to the
** front of the db->pCsr list. Returns the new cursor, or NULL if the
** allocation failed. The allocation is made with lsmMallocZeroRc(), which
** is passed pRc.
*/
static MultiCursor *multiCursorNew(lsm_db *db, int *pRc){
  MultiCursor *pNew;

  pNew = (MultiCursor *)lsmMallocZeroRc(db->pEnv, sizeof(MultiCursor), pRc);
  if( pNew==0 ) return 0;

  pNew->pDb = db;
  pNew->pNext = db->pCsr;
  db->pCsr = pNew;
  return pNew;
}
/*
** For every cursor on the pDb->pCsr list, reload the current key of its
** b-tree cursor (if it has one) and the current cell of each of its
** segment pointers. Return codes from segmentPtrLoadCell() are ignored.
**
** NOTE(review): going by the name, this is presumably called after the
** database file mapping has changed - confirm against the callers.
*/
void lsmSortedRemap(lsm_db *pDb){
  MultiCursor *pCsr = pDb->pCsr;

  while( pCsr ){
    int i;
    if( pCsr->pBtCsr ){
      btreeCursorLoadKey(pCsr->pBtCsr);
    }
    for(i=0; i<pCsr->nPtr; i++){
      SegmentPtr *pSegPtr = &pCsr->aPtr[i];
      segmentPtrLoadCell(pSegPtr, pSegPtr->iCell);
    }
    pCsr = pCsr->pNext;
  }
}
/*
** Set the CURSOR_READ_SEPARATORS flag on multi-cursor pCsr, provided that
** it has at least one segment pointer.
*/
static void multiCursorReadSeparators(MultiCursor *pCsr){
  if( pCsr->nPtr<=0 ) return;
  pCsr->flags |= CURSOR_READ_SEPARATORS;
}
/*
** Set the CURSOR_IGNORE_DELETE flag on multi-cursor pCsr, so that it
** skips over delete entries. Passing NULL is a no-op.
*/
static void multiCursorIgnoreDelete(MultiCursor *pCsr){
  if( pCsr==0 ) return;
  pCsr->flags |= CURSOR_IGNORE_DELETE;
}
/*
** Configure multi-cursor pCsr to visit the entries of the in-memory
** free-block list. This sets the CURSOR_FLUSH_FREELIST flag and allocates
** the 12 byte pCsr->pSystemVal buffer. multiCursorGetKey() builds each
** 4-byte key in the first four bytes of that buffer and
** multiCursorGetVal() builds each 8-byte value in the remaining eight.
**
** Returns LSM_OK on success, or an lsm error code if the allocation fails.
*/
static int multiCursorVisitFreelist(MultiCursor *pCsr){
  int rc = LSM_OK;
  lsm_env *pEnv = pCsr->pDb->pEnv;

  pCsr->flags |= CURSOR_FLUSH_FREELIST;
  pCsr->pSystemVal = lsmMallocRc(pEnv, 4 + 8, &rc);
  return rc;
}
/*
** Allocate and return a new database cursor.
**
** This method should only be called to allocate user cursors, as it may
** recycle a cursor from lsm_db.pCsrCache instead of allocating a new one.
**
** On success LSM_OK is returned and *ppCsr is set to the cursor. On
** failure an lsm error code is returned and *ppCsr is set to NULL.
*/
int lsmMCursorNew(
  lsm_db *pDb,                    /* Database handle */
  MultiCursor **ppCsr             /* OUT: Allocated cursor */
){
  MultiCursor *pNew = 0;
  int rc = LSM_OK;

  if( pDb->pCsrCache==0 ){
    /* Nothing to recycle. Build a new cursor on the client snapshot. */
    pNew = multiCursorNew(pDb, &rc);
    if( rc==LSM_OK ) rc = multiCursorInit(pNew, pDb->pClient);
  }else{
    int bNeedOld;         /* True if a cursor on the old tree is required */

    /* Remove a cursor from the pCsrCache list and add it to the open list. */
    pNew = pDb->pCsrCache;
    pDb->pCsrCache = pNew->pNext;
    pNew->pNext = pDb->pCsr;
    pDb->pCsr = pNew;

    /* The cursor can almost be used as is, except that the old in-memory
    ** tree cursor may be present and not required, or required and not
    ** present. Fix this if required. */
    bNeedOld = (
        lsmTreeHasOld(pDb) && pDb->treehdr.iOldLog!=pDb->pClient->iLogOff
    );
    if( bNeedOld ){
      if( pNew->apTreeCsr[1]==0 ){
        rc = lsmTreeCursorNew(pDb, 1, &pNew->apTreeCsr[1]);
      }
    }else if( pNew->apTreeCsr[1] ){
      lsmTreeCursorDestroy(pNew->apTreeCsr[1]);
      pNew->apTreeCsr[1] = 0;
    }

    pNew->flags = (CURSOR_IGNORE_SYSTEM | CURSOR_IGNORE_DELETE);
  }

  if( rc!=LSM_OK ){
    lsmMCursorClose(pNew, 0);
    pNew = 0;
  }

  assert( (rc==LSM_OK)==(pNew!=0) );
  *ppCsr = pNew;
  return rc;
}
/*
** Retrieve the current value of component iVal of multi-cursor pCsr,
** where iVal is one of CURSOR_DATA_TREE0, CURSOR_DATA_TREE1,
** CURSOR_DATA_SYSTEM or (CURSOR_DATA_SEGMENT+N) for segment pointer N.
**
** *ppVal and *pnVal are set to the value and its size in bytes, or to
** NULL and 0 if the component has no current value. For
** CURSOR_DATA_SYSTEM, only even values of pCsr->iFree have a value: the
** 8-byte snapshot id of the free-list entry, built in the pCsr->pSystemVal
** buffer at offset 4.
**
** This implementation always returns LSM_OK.
*/
static int multiCursorGetVal(
  MultiCursor *pCsr,
  int iVal,
  void **ppVal,
  int *pnVal
){
  int rc = LSM_OK;

  /* Start with "no value". Each branch below overrides this if it can. */
  *ppVal = 0;
  *pnVal = 0;

  if( iVal==CURSOR_DATA_TREE0 || iVal==CURSOR_DATA_TREE1 ){
    TreeCursor *pTreeCsr = pCsr->apTreeCsr[iVal-CURSOR_DATA_TREE0];
    if( lsmTreeCursorValid(pTreeCsr) ){
      lsmTreeCursorValue(pTreeCsr, ppVal, pnVal);
    }
  }else if( iVal==CURSOR_DATA_SYSTEM ){
    Snapshot *pWorker = pCsr->pDb->pWorker;
    if( pWorker
     && (pCsr->iFree % 2)==0
     && pCsr->iFree < (pWorker->freelist.nEntry*2)
    ){
      int iEntry = pWorker->freelist.nEntry - 1 - (pCsr->iFree / 2);
      u8 *aVal = &((u8 *)(pCsr->pSystemVal))[4];
      lsmPutU64(aVal, pWorker->freelist.aEntry[iEntry].iId);
      *ppVal = aVal;
      *pnVal = 8;
    }
  }else{
    int iSeg = iVal-CURSOR_DATA_SEGMENT;
    if( iSeg<pCsr->nPtr ){
      SegmentPtr *pSegPtr = &pCsr->aPtr[iSeg];
      if( pSegPtr->pPg ){
        *ppVal = pSegPtr->pVal;
        *pnVal = pSegPtr->nVal;
      }
    }
  }

  assert( rc==LSM_OK || (*ppVal==0 && *pnVal==0) );
  return rc;
}
static int multiCursorAdvance(MultiCursor *pCsr, int bReverse);

/*
** This function is called by worker connections to walk the part of the
** free-list stored within the LSM data structure.
**
** The system keys of the database are visited in turn. For each, callback
** x() is invoked with pCtx, the block number (the bitwise complement of
** the 4-byte key) and the snapshot id (the 8-byte value). The walk stops
** early if the callback returns non-zero. Because the keys are stored
** complemented, the cursor is moved in the opposite direction to the one
** in which block numbers are visited.
**
** If pDb->bIncrMerge is set, the walk is made over a snapshot deserialized
** from pDb->pShmhdr->aSnap1, which is freed before returning. Otherwise
** the worker snapshot is used directly.
**
** Returns LSM_OK on success, LSM_CORRUPT_BKPT if an entry does not have a
** 4-byte key and an 8-byte value, or another lsm error code.
*/
int lsmSortedWalkFreelist(
  lsm_db *pDb,                    /* Database handle */
  int bReverse,                   /* True to iterate from largest to smallest */
  int (*x)(void *, int, i64),     /* Callback function */
  void *pCtx                      /* First argument to pass to callback */
){
  MultiCursor *pCsr;              /* Cursor used to read db */
  Snapshot *pSnap = 0;            /* Snapshot to read */
  int rc = LSM_OK;                /* Return Code */

  assert( pDb->pWorker );
  if( pDb->bIncrMerge==0 ){
    pSnap = pDb->pWorker;
  }else{
    rc = lsmCheckpointDeserialize(pDb, 0, pDb->pShmhdr->aSnap1, &pSnap);
    if( rc!=LSM_OK ) return rc;
  }

  pCsr = multiCursorNew(pDb, &rc);
  if( pCsr ){
    rc = multiCursorAddAll(pCsr, pSnap);
    pCsr->flags |= CURSOR_IGNORE_DELETE;
  }

  if( rc==LSM_OK ){
    rc = bReverse
       ? lsmMCursorSeek(pCsr, 1, "", 0, LSM_SEEK_GE)
       : lsmMCursorLast(pCsr);

    while( rc==LSM_OK && lsmMCursorValid(pCsr) && rtIsSystem(pCsr->eType) ){
      void *pKey; int nKey;
      void *pVal = 0; int nVal = 0;
      int iBlk;
      i64 iSnap;

      rc = lsmMCursorKey(pCsr, &pKey, &nKey);
      if( rc==LSM_OK ) rc = lsmMCursorValue(pCsr, &pVal, &nVal);
      if( rc==LSM_OK && (nKey!=4 || nVal!=8) ) rc = LSM_CORRUPT_BKPT;

      /* On error, the loop condition terminates the walk. */
      if( rc!=LSM_OK ) continue;

      iBlk = (int)(~(lsmGetU32((u8 *)pKey)));
      iSnap = (i64)lsmGetU64((u8 *)pVal);
      if( x(pCtx, iBlk, iSnap) ) break;
      rc = multiCursorAdvance(pCsr, !bReverse);
    }
  }

  lsmMCursorClose(pCsr, 0);
  if( pSnap!=pDb->pWorker ){
    lsmFreeSnapshot(pDb->pEnv, pSnap);
  }

  return rc;
}
/*
** Load the free-list blob stored in the LSM, if there is one.
**
** A cursor over the worker snapshot is moved to its last entry. If that
** entry is a system write record whose key is the 8 bytes "FREELIST", the
** value is copied into a buffer obtained from lsmMallocRc() and returned
** via *ppVal / *pnVal (presumably to be freed by the caller - confirm
** against callers). Otherwise *ppVal and *pnVal are left unchanged, which
** per the assert() below means zero.
**
** Returns LSM_OK on success, or an LSM error code if the cursor cannot
** be created or read, or if the allocation fails.
*/
int lsmSortedLoadFreelist(
  lsm_db *pDb,                    /* Database handle (must be worker) */
  void **ppVal,                   /* OUT: Blob containing LSM free-list */
  int *pnVal                      /* OUT: Size of *ppVal blob in bytes */
){
  MultiCursor *pCsr;              /* Cursor used to retrieve free-list */
  int rc = LSM_OK;                /* Return Code */

  assert( pDb->pWorker );
  assert( *ppVal==0 && *pnVal==0 );

  pCsr = multiCursorNew(pDb, &rc);
  if( pCsr ){
    rc = multiCursorAddAll(pCsr, pDb->pWorker);
    pCsr->flags |= CURSOR_IGNORE_DELETE;
  }

  if( rc==LSM_OK ){
    rc = lsmMCursorLast(pCsr);
    if( rc==LSM_OK
     && rtIsWrite(pCsr->eType) && rtIsSystem(pCsr->eType)
     && pCsr->key.nData==8
     && 0==memcmp(pCsr->key.pData, "FREELIST", 8)
    ){
      void *pVal; int nVal;         /* Value read from database */
      rc = lsmMCursorValue(pCsr, &pVal, &nVal);
      if( rc==LSM_OK ){
        *ppVal = lsmMallocRc(pDb->pEnv, nVal, &rc);
        if( *ppVal ){
          memcpy(*ppVal, pVal, nVal);
          *pnVal = nVal;
        }
      }
    }
  }

  /* Close the cursor on every path. Previously this was skipped when
  ** multiCursorAddAll() failed, leaking the cursor that multiCursorNew()
  ** had already created. lsmSortedWalkFreelist() closes its cursor
  ** unconditionally in the same way, including when pCsr is NULL. */
  lsmMCursorClose(pCsr, 0);

  return rc;
}
/*
** Ensure that the pCsr->aTree[] array has been allocated. If it already
** exists this is a no-op.
**
** Otherwise pCsr->nTree is set to the smallest power of two (minimum 2)
** that is not less than the number of sub-cursors being merged, and a
** zeroed array of (2 * nTree) ints is allocated. Returns LSM_OK, or the
** error code set by lsmMallocZeroRc().
*/
static int multiCursorAllocTree(MultiCursor *pCsr){
  int rc = LSM_OK;

  if( pCsr->aTree==0 ){
    int nCsr;                     /* Total number of cursors being merged */
    int nSlot = 2;                /* Candidate value for pCsr->nTree */
    int nByte;                    /* Bytes of space to allocate */

    nCsr = CURSOR_DATA_SEGMENT + pCsr->nPtr + (pCsr->pBtCsr!=0);
    while( nSlot<nCsr ) nSlot = nSlot*2;
    pCsr->nTree = nSlot;

    nByte = sizeof(int)*pCsr->nTree*2;
    pCsr->aTree = (int *)lsmMallocZeroRc(pCsr->pDb->pEnv, nByte, &rc);
  }

  return rc;
}
/*
** If *pRc is LSM_OK when this function is called, copy the key of the
** sub-cursor at the root of the merge tree (pCsr->aTree[1]) into
** pCsr->key and store its type in pCsr->eType. *pRc is set to the result
** of the copy. If *pRc is already an error code this is a no-op.
*/
static void multiCursorCacheKey(MultiCursor *pCsr, int *pRc){
  void *pCurKey;
  int nCurKey;

  if( *pRc!=LSM_OK ) return;

  multiCursorGetKey(pCsr, pCsr->aTree[1], &pCsr->eType, &pCurKey, &nCurKey);
  *pRc = sortedBlobSet(pCsr->pDb->pEnv, &pCsr->key, pCurKey, nCurKey);
}
/*
** Debugging aid, compiled only when LSM_DEBUG_EXPENSIVE is defined. In all
** other builds assertCursorTree() is a macro that expands to nothing.
**
** The function temporarily detaches the cursor's aTree[] array, allocates
** a fresh one with multiCursorAllocTree() and repopulates it by running
** multiCursorDoCompare() on every node from (nTree-1) down to 1. The
** comparison direction comes from the CURSOR_PREV_OK flag. It then
** assert()s that the rebuilt array has the same nTree and that its first
** nSave ints match the saved array. The saved array and size are restored
** before returning.
*/
#ifdef LSM_DEBUG_EXPENSIVE
static void assertCursorTree(MultiCursor *pCsr){
int bRev = !!(pCsr->flags & CURSOR_PREV_OK);
/* Save the live tree so that it can be restored below. */
int *aSave = pCsr->aTree;
int nSave = pCsr->nTree;
int rc;
/* Clearing aTree forces multiCursorAllocTree() to allocate a new array. */
pCsr->aTree = 0;
pCsr->nTree = 0;
rc = multiCursorAllocTree(pCsr);
if( rc==LSM_OK ){
int i;
for(i=pCsr->nTree-1; i>0; i--){
multiCursorDoCompare(pCsr, i, bRev);
}
assert( nSave==pCsr->nTree
&& 0==memcmp(aSave, pCsr->aTree, sizeof(int)*nSave)
);
/* Discard the temporary array built for the comparison. */
lsmFree(pCsr->pDb->pEnv, pCsr->aTree);
}
pCsr->aTree = aSave;
pCsr->nTree = nSave;
}
#else
# define assertCursorTree(x)
#endif
/*
** Decide whether the entry the cursor currently points to is one that the
** cursor should stop at. The entry is described by the cached type
** pCsr->eType and cached key pCsr->key. Return 1 if so, or 0 if the
** caller should skip past it.
**
** The entry is rejected (0 is returned) if:
**
**   * CURSOR_IGNORE_DELETE is set, bDeleteOk is zero and the entry does
**     not have the LSM_INSERT flag,
**   * CURSOR_IGNORE_SYSTEM is set and the entry has a non-zero topic, or
**   * a sub-cursor with a lower index than the current one shows that the
**     key has been removed by a range-delete (see the comment below).
**
** One of CURSOR_NEXT_OK or CURSOR_PREV_OK must be set (asserted).
*/
static int mcursorLocationOk(MultiCursor *pCsr, int bDeleteOk){
int eType = pCsr->eType;
int iKey;
int i;
int rdmask;
assert( pCsr->flags & (CURSOR_NEXT_OK|CURSOR_PREV_OK) );
assertCursorTree(pCsr);
/* The range-delete flag of interest depends on the iteration direction:
** END_DELETE when moving forwards, START_DELETE when moving backwards. */
rdmask = (pCsr->flags & CURSOR_NEXT_OK) ? LSM_END_DELETE : LSM_START_DELETE;
/* If the cursor does not currently point to an actual database key (i.e.
** it points to a delete key, or the start or end of a range-delete), and
** the CURSOR_IGNORE_DELETE flag is set, skip past this entry. */
if( (pCsr->flags & CURSOR_IGNORE_DELETE) && bDeleteOk==0 ){
if( (eType & LSM_INSERT)==0 ) return 0;
}
/* If the cursor points to a system key (free-list entry), and the
** CURSOR_IGNORE_SYSTEM flag is set, skip this entry. */
if( (pCsr->flags & CURSOR_IGNORE_SYSTEM) && rtTopic(eType)!=0 ){
return 0;
}
#ifndef NDEBUG
/* This block fires assert() statements to check one of the assumptions
** in the comment below - that if the lhs sub-cursor of a level undergoing
** a merge is valid, then all the rhs sub-cursors must be at EOF.
**
** Also assert that all rhs sub-cursors are either at EOF or point to
** a key that is not less than the level split-key. */
for(i=0; i<pCsr->nPtr; i++){
SegmentPtr *pPtr = &pCsr->aPtr[i];
Level *pLvl = pPtr->pLevel;
if( pLvl->nRight && pPtr->pPg ){
if( pPtr->pSeg==&pLvl->lhs ){
int j;
for(j=0; j<pLvl->nRight; j++) assert( pPtr[j+1].pPg==0 );
}else{
int res = sortedKeyCompare(pCsr->pDb->xCmp,
rtTopic(pPtr->eType), pPtr->pKey, pPtr->nKey,
pLvl->iSplitTopic, pLvl->pSplitKey, pLvl->nSplitKey
);
assert( res>=0 );
}
}
}
#endif
/* Now check if this key has already been deleted by a range-delete. If
** so, skip past it.
**
** Assume, for the moment, that the tree contains no levels currently
** undergoing incremental merge, and that this cursor is iterating forwards
** through the database keys. The cursor currently points to a key in
** level L. This key has already been deleted if any of the sub-cursors
** that point to levels newer than L (or to the in-memory tree) point to
** a key greater than the current key with the LSM_END_DELETE flag set.
**
** Or, if the cursor is iterating backwards through data keys, if any
** such sub-cursor points to a key smaller than the current key with the
** LSM_START_DELETE flag set.
**
** Why it works with levels undergoing a merge too:
**
** When a cursor iterates forwards, the sub-cursors for the rhs of a
** level are only activated once the lhs reaches EOF. So when iterating
** forwards, the keys visited are the same as if the level was completely
** merged.
**
** If the cursor is iterating backwards, then the lhs sub-cursor is not
** initialized until the last of the rhs sub-cursors has reached EOF.
** Additionally, if the START_DELETE flag is set on the last entry (in
** reverse order - so the entry with the smallest key) of a rhs sub-cursor,
** then a pseudo-key equal to the levels split-key with the END_DELETE
** flag set is visited by the sub-cursor.
*/
iKey = pCsr->aTree[1];
/* Only sub-cursors with an index lower than the current one are examined. */
for(i=0; i<iKey; i++){
int csrflags;
multiCursorGetKey(pCsr, i, &csrflags, 0, 0);
if( (rdmask & csrflags) ){
const int SD_ED = (LSM_START_DELETE|LSM_END_DELETE);
/* A sub-cursor positioned on exactly the same key as the current entry
** does not hide it, but this exemption is only considered when the
** sub-cursor entry carries both range-delete flags or the cursor is
** not ignoring deletes. */
if( (csrflags & SD_ED)==SD_ED
|| (pCsr->flags & CURSOR_IGNORE_DELETE)==0
){
void *pKey; int nKey;
multiCursorGetKey(pCsr, i, 0, &pKey, &nKey);
if( 0==sortedKeyCompare(pCsr->pDb->xCmp,
rtTopic(eType), pCsr->key.pData, pCsr->key.nData,
rtTopic(csrflags), pKey, nKey
)){
continue;
}
}
return 0;
}
}
/* The current cursor position is one this cursor should visit. Return 1. */
return 1;
}
/*
** Build the merge tree (pCsr->aTree[]) for a cursor whose sub-cursors have
** just been positioned, then cache the key of the winning sub-cursor. If
** that entry is not one the cursor should visit, advance the cursor in
** direction bRev (non-zero for reverse) until it is.
**
** Returns LSM_OK if successful, or an LSM error code otherwise.
*/
static int multiCursorSetupTree(MultiCursor *pCsr, int bRev){
  int rc = multiCursorAllocTree(pCsr);

  if( rc==LSM_OK ){
    /* Populate the tree bottom-up: nodes (nTree-1) down to 1. */
    int iNode = pCsr->nTree;
    while( --iNode>0 ){
      multiCursorDoCompare(pCsr, iNode, bRev);
    }
  }

  assertCursorTree(pCsr);
  multiCursorCacheKey(pCsr, &rc);

  if( rc!=LSM_OK ) return rc;
  if( mcursorLocationOk(pCsr, 0) ) return rc;
  return multiCursorAdvance(pCsr, bRev);
}
/*
** Move the multi-cursor to one end of the data set: the first entry if
** bLast is zero, or the last entry if bLast is non-zero. This is the
** implementation of lsmMCursorFirst() and lsmMCursorLast().
**
** CURSOR_NEXT_OK (bLast==0) or CURSOR_PREV_OK (bLast!=0) is set and the
** other direction flag and CURSOR_SEEK_EQ are cleared. Each sub-cursor is
** positioned in turn and the merge tree is then rebuilt by
** multiCursorSetupTree().
**
** Returns LSM_OK if successful, or an LSM error code otherwise.
*/
static int multiCursorEnd(MultiCursor *pCsr, int bLast){
int rc = LSM_OK;
int i;
pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK | CURSOR_SEEK_EQ);
pCsr->flags |= (bLast ? CURSOR_PREV_OK : CURSOR_NEXT_OK);
pCsr->iFree = 0;
/* Position the two in-memory tree cursors */
for(i=0; rc==LSM_OK && i<2; i++){
if( pCsr->apTreeCsr[i] ){
rc = lsmTreeCursorEnd(pCsr->apTreeCsr[i], bLast);
}
}
/* Position the segment pointers. Each iteration handles one level: the
** entry aPtr[i] plus the pLvl->nRight entries that follow it, which is
** why i is advanced by nRight at the bottom of the loop. */
for(i=0; rc==LSM_OK && i<pCsr->nPtr; i++){
SegmentPtr *pPtr = &pCsr->aPtr[i];
Level *pLvl = pPtr->pLevel;
int iRhs;
int bHit = 0;
if( bLast ){
/* Seeking to the end: position the rhs pointers first. The lhs pointer
** is positioned only if none of the rhs pointers ended up on a page.
** Otherwise it is reset. */
for(iRhs=0; iRhs<pLvl->nRight && rc==LSM_OK; iRhs++){
rc = segmentPtrEnd(pCsr, &pPtr[iRhs+1], 1);
if( pPtr[iRhs+1].pPg ) bHit = 1;
}
if( bHit==0 && rc==LSM_OK ){
rc = segmentPtrEnd(pCsr, pPtr, 1);
}else{
segmentPtrReset(pPtr, LSM_SEGMENTPTR_FREE_THRESHOLD);
}
}else{
/* Seeking to the start: position the lhs pointer first, if pPtr is the
** lhs. If it found a key the rhs pointers are reset; otherwise they
** are positioned with sortedRhsFirst(). */
int bLhs = (pPtr->pSeg==&pLvl->lhs);
assert( pPtr->pSeg==&pLvl->lhs || pPtr->pSeg==&pLvl->aRhs[0] );
if( bLhs ){
rc = segmentPtrEnd(pCsr, pPtr, 0);
if( pPtr->pKey ) bHit = 1;
}
for(iRhs=0; iRhs<pLvl->nRight && rc==LSM_OK; iRhs++){
if( bHit ){
segmentPtrReset(&pPtr[iRhs+1], LSM_SEGMENTPTR_FREE_THRESHOLD);
}else{
rc = sortedRhsFirst(pCsr, pLvl, &pPtr[iRhs+bLhs]);
}
}
}
i += pLvl->nRight;
}
/* And the b-tree cursor, if applicable */
if( rc==LSM_OK && pCsr->pBtCsr ){
assert( bLast==0 );
rc = btreeCursorFirst(pCsr->pBtCsr);
}
if( rc==LSM_OK ){
rc = multiCursorSetupTree(pCsr, bLast);
}
return rc;
}
/*
** Save the position of cursor pCsr and release its components.
**
** If the cursor has a merge tree and the current entry comes from one of
** the two in-memory tree cursors, the current key is first copied into
** pCsr->key. mcursorFreeComponents() is then called unconditionally.
** Returns LSM_OK, or an error code if caching the key fails.
*/
int mcursorSave(MultiCursor *pCsr){
  int rc = LSM_OK;
  int *aTree = pCsr->aTree;

  if( aTree && (aTree[1]==CURSOR_DATA_TREE0 || aTree[1]==CURSOR_DATA_TREE1) ){
    multiCursorCacheKey(pCsr, &rc);
  }

  mcursorFreeComponents(pCsr);
  return rc;
}
/*
** Restore a cursor previously saved by mcursorSave(). The cursor is
** re-initialized against the current client snapshot (pDb->pClient). If
** it holds a cached key, it is then seeked to that key with eSeek value
** +1 (presumably LSM_SEEK_GE - confirm against lsm.h).
**
** Returns LSM_OK if successful, or an LSM error code otherwise.
*/
int mcursorRestore(lsm_db *pDb, MultiCursor *pCsr){
  int rc = multiCursorInit(pCsr, pDb->pClient);

  if( rc!=LSM_OK || pCsr->key.pData==0 ){
    return rc;
  }
  return lsmMCursorSeek(pCsr,
      rtTopic(pCsr->eType), pCsr->key.pData, pCsr->key.nData, +1
  );
}
/*
** Call mcursorSave() on each cursor in the pDb->pCsr list, stopping at
** the first failure. Returns LSM_OK, or the error code from the failing
** mcursorSave() call.
*/
int lsmSaveCursors(lsm_db *pDb){
  int rc = LSM_OK;
  MultiCursor *pIter = pDb->pCsr;

  while( rc==LSM_OK && pIter ){
    rc = mcursorSave(pIter);
    pIter = pIter->pNext;
  }
  return rc;
}
/*
** Call mcursorRestore() on each cursor in the pDb->pCsr list, stopping at
** the first failure. Returns LSM_OK, or the error code from the failing
** mcursorRestore() call.
*/
int lsmRestoreCursors(lsm_db *pDb){
  int rc = LSM_OK;
  MultiCursor *pIter = pDb->pCsr;

  while( rc==LSM_OK && pIter ){
    rc = mcursorRestore(pDb, pIter);
    pIter = pIter->pNext;
  }
  return rc;
}
/*
** Move the cursor to the first entry it should visit. Returns LSM_OK or
** an LSM error code.
*/
int lsmMCursorFirst(MultiCursor *pCsr){
  int rc = multiCursorEnd(pCsr, 0);
  return rc;
}
/*
** Move the cursor to the last entry it should visit. Returns LSM_OK or
** an LSM error code.
*/
int lsmMCursorLast(MultiCursor *pCsr){
  int rc = multiCursorEnd(pCsr, 1);
  return rc;
}
/*
** Return the database handle that cursor pCsr belongs to.
*/
lsm_db *lsmMCursorDb(MultiCursor *pCsr){
  lsm_db *pDb = pCsr->pDb;
  return pDb;
}
/*
** Reset the cursor: reset both in-memory tree cursors and every segment
** pointer, and mark the cached key as empty (key.nData set to 0).
*/
void lsmMCursorReset(MultiCursor *pCsr){
  int iTree;
  int iSeg;

  for(iTree=0; iTree<2; iTree++){
    lsmTreeCursorReset(pCsr->apTreeCsr[iTree]);
  }
  for(iSeg=0; iSeg<pCsr->nPtr; iSeg++){
    segmentPtrReset(&pCsr->aPtr[iSeg], LSM_SEGMENTPTR_FREE_THRESHOLD);
  }
  pCsr->key.nData = 0;
}
/*
** Seek in-memory tree cursor pTreeCsr to key (pKey/nKey) on behalf of
** multi-cursor pCsr, then adjust its position according to eSeek. If
** pTreeCsr is NULL this function is a no-op that returns LSM_OK.
**
** lsmTreeCursorSeek() reports its result through "res". The code below
** treats res==0 as an exact match. Presumably res<0 / res>0 mean the
** cursor landed on an entry smaller / larger than the key - confirm
** against lsmTreeCursorSeek().
**
**   LSM_SEEK_EQ:  *pbStop is set to 1 if the entry landed on shows that
**                 the key has been deleted, or if it is an exact-match
**                 LSM_INSERT entry. In the latter case the key and value
**                 are copied into pCsr->key and pCsr->val and the
**                 CURSOR_SEEK_EQ flag is set. The tree cursor is always
**                 reset afterwards.
**
**   LSM_SEEK_GE:  If res<0 and the cursor is valid, step it forwards once.
**
**   otherwise:    If res>0, step the cursor backwards once.
**
** Returns LSM_OK, or an error code from reading or copying the key/value.
*/
static int treeCursorSeek(
MultiCursor *pCsr,
TreeCursor *pTreeCsr,
void *pKey, int nKey,
int eSeek,
int *pbStop
){
int rc = LSM_OK;
if( pTreeCsr ){
int res = 0;
lsmTreeCursorSeek(pTreeCsr, pKey, nKey, &res);
switch( eSeek ){
case LSM_SEEK_EQ: {
int eType = lsmTreeCursorFlags(pTreeCsr);
/* The key is covered by a delete: either a range-delete boundary on the
** neighbouring entry, or a point-delete on the key itself. */
if( (res<0 && (eType & LSM_START_DELETE))
|| (res>0 && (eType & LSM_END_DELETE))
|| (res==0 && (eType & LSM_POINT_DELETE))
){
*pbStop = 1;
}else if( res==0 && (eType & LSM_INSERT) ){
lsm_env *pEnv = pCsr->pDb->pEnv;
void *p; int n; /* Key/value from tree-cursor */
*pbStop = 1;
pCsr->flags |= CURSOR_SEEK_EQ;
rc = lsmTreeCursorKey(pTreeCsr, &pCsr->eType, &p, &n);
if( rc==LSM_OK ) rc = sortedBlobSet(pEnv, &pCsr->key, p, n);
if( rc==LSM_OK ) rc = lsmTreeCursorValue(pTreeCsr, &p, &n);
if( rc==LSM_OK ) rc = sortedBlobSet(pEnv, &pCsr->val, p, n);
}
lsmTreeCursorReset(pTreeCsr);
break;
}
case LSM_SEEK_GE:
if( res<0 && lsmTreeCursorValid(pTreeCsr) ){
lsmTreeCursorNext(pTreeCsr);
}
break;
default:
if( res>0 ){
assert( lsmTreeCursorValid(pTreeCsr) );
lsmTreeCursorPrev(pTreeCsr);
}
break;
}
}
return rc;
}
/*
** Seek the cursor to key (iTopic/pKey/nKey).
**
** eSeek must be one of LSM_SEEK_EQ, LSM_SEEK_LE, LSM_SEEK_GE or
** LSM_SEEK_LEFAST. LSM_SEEK_LEFAST is treated as LSM_SEEK_LE while the
** sub-cursors are positioned (see eESeek below). After that it skips the
** mcursorLocationOk() check and does not set CURSOR_PREV_OK.
**
** The two in-memory tree cursors are seeked first, then each level's
** segment pointers, stopping early once any of them sets bStop. For seeks
** other than LSM_SEEK_EQ the merge tree is then built and the current key
** cached. If the resulting entry is not one the cursor should visit, the
** cursor is stepped in the seek direction.
**
** Returns LSM_OK if successful, or an LSM error code otherwise.
*/
int lsmMCursorSeek(
MultiCursor *pCsr,
int iTopic,
void *pKey, int nKey,
int eSeek
){
int eESeek = eSeek; /* Effective eSeek parameter */
int bStop = 0; /* Set to true to halt search operation */
int rc = LSM_OK; /* Return code */
int iPtr = 0; /* Used to iterate through pCsr->aPtr[] */
LsmPgno iPgno = 0; /* FC pointer value */
/* A cursor that has in-memory tree cursors may only seek within topic 0. */
assert( pCsr->apTreeCsr[0]==0 || iTopic==0 );
assert( pCsr->apTreeCsr[1]==0 || iTopic==0 );
if( eESeek==LSM_SEEK_LEFAST ) eESeek = LSM_SEEK_LE;
assert( eESeek==LSM_SEEK_EQ || eESeek==LSM_SEEK_LE || eESeek==LSM_SEEK_GE );
assert( (pCsr->flags & CURSOR_FLUSH_FREELIST)==0 );
assert( pCsr->nPtr==0 || pCsr->aPtr[0].pLevel );
pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK | CURSOR_SEEK_EQ);
rc = treeCursorSeek(pCsr, pCsr->apTreeCsr[0], pKey, nKey, eESeek, &bStop);
if( rc==LSM_OK && bStop==0 ){
rc = treeCursorSeek(pCsr, pCsr->apTreeCsr[1], pKey, nKey, eESeek, &bStop);
}
/* Seek all segment pointers. Each iteration handles one whole level, so
** iPtr also skips over that level's nRight rhs pointers. */
for(iPtr=0; iPtr<pCsr->nPtr && rc==LSM_OK && bStop==0; iPtr++){
SegmentPtr *pPtr = &pCsr->aPtr[iPtr];
assert( pPtr->pSeg==&pPtr->pLevel->lhs );
rc = seekInLevel(pCsr, pPtr, eESeek, iTopic, pKey, nKey, &iPgno, &bStop);
iPtr += pPtr->pLevel->nRight;
}
if( eSeek!=LSM_SEEK_EQ ){
if( rc==LSM_OK ){
rc = multiCursorAllocTree(pCsr);
}
if( rc==LSM_OK ){
int i;
for(i=pCsr->nTree-1; i>0; i--){
multiCursorDoCompare(pCsr, i, eESeek==LSM_SEEK_LE);
}
if( eSeek==LSM_SEEK_GE ) pCsr->flags |= CURSOR_NEXT_OK;
if( eSeek==LSM_SEEK_LE ) pCsr->flags |= CURSOR_PREV_OK;
}
multiCursorCacheKey(pCsr, &rc);
if( rc==LSM_OK && eSeek!=LSM_SEEK_LEFAST && 0==mcursorLocationOk(pCsr, 0) ){
/* NOTE(review): the LSM_SEEK_EQ case below cannot be reached. This
** switch is inside "if( eSeek!=LSM_SEEK_EQ )", and eESeek only differs
** from eSeek when eSeek is LSM_SEEK_LEFAST. */
switch( eESeek ){
case LSM_SEEK_EQ:
lsmMCursorReset(pCsr);
break;
case LSM_SEEK_GE:
rc = lsmMCursorNext(pCsr);
break;
default:
rc = lsmMCursorPrev(pCsr);
break;
}
}
}
return rc;
}
/*
** Return non-zero if the cursor currently points to a valid entry, or
** zero otherwise.
**
** A cursor with CURSOR_SEEK_EQ set is always valid. A cursor with no merge
** tree is never valid. Otherwise validity is that of the sub-cursor at
** the root of the merge tree: lsmTreeCursorValid() for an in-memory tree
** cursor, or a non-NULL key for any other sub-cursor.
*/
int lsmMCursorValid(MultiCursor *pCsr){
  int iKey;
  void *pKey;

  if( pCsr->flags & CURSOR_SEEK_EQ ) return 1;
  if( pCsr->aTree==0 ) return 0;

  iKey = pCsr->aTree[1];
  if( iKey==CURSOR_DATA_TREE0 || iKey==CURSOR_DATA_TREE1 ){
    return lsmTreeCursorValid(pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0]);
  }

  multiCursorGetKey(pCsr, iKey, 0, &pKey, 0);
  return (pKey!=0);
}
/*
** This function is the loop condition of multiCursorAdvance(). It is
** called after the winning sub-cursor has been stepped and the merge tree
** updated. It returns 1 if the advance is complete and 0 if the caller
** must step again.
**
** It returns 1 if *pRc is already an error, if the cursor has no current
** key (pNew is NULL), or if the new key lies strictly beyond the cached key
** in the direction of travel and mcursorLocationOk() accepts it. In the
** last case, and whenever mcursorLocationOk() is consulted, the new key
** has been cached in pCsr->key / pCsr->eType. An error while caching is
** returned via *pRc.
*/
static int mcursorAdvanceOk(
MultiCursor *pCsr,
int bReverse,
int *pRc
){
void *pNew; /* Pointer to buffer containing new key */
int nNew; /* Size of buffer pNew in bytes */
int eNewType; /* Type of new record */
if( *pRc ) return 1;
/* Check the current key value. If it is not greater than (if bReverse==0)
** or less than (if bReverse!=0) the key currently cached in pCsr->key,
** then the cursor has not yet been successfully advanced.
*/
multiCursorGetKey(pCsr, pCsr->aTree[1], &eNewType, &pNew, &nNew);
if( pNew ){
/* With CURSOR_IGNORE_DELETE set, all type bits take part in the
** comparison. Otherwise only the LSM_SYSTEMKEY bit does. */
int typemask = (pCsr->flags & CURSOR_IGNORE_DELETE) ? ~(0) : LSM_SYSTEMKEY;
int res = sortedDbKeyCompare(pCsr,
eNewType & typemask, pNew, nNew,
pCsr->eType & typemask, pCsr->key.pData, pCsr->key.nData
);
if( (bReverse==0 && res<=0) || (bReverse!=0 && res>=0) ){
return 0;
}
multiCursorCacheKey(pCsr, pRc);
assert( pCsr->eType==eNewType );
/* If this cursor is configured to skip deleted keys, and the current
** cursor points to a SORTED_DELETE entry, then the cursor has not been
** successfully advanced.
**
** Similarly, if the cursor is configured to skip system keys and the
** current cursor points to a system key, it has not yet been advanced.
*/
if( *pRc==LSM_OK && 0==mcursorLocationOk(pCsr, 0) ) return 0;
}
return 1;
}
/*
** Advance the free-list position pCsr->iFree of a cursor that has the
** CURSOR_FLUSH_FREELIST flag set (asserted).
**
** iFree addresses the worker snapshot's free-list from the end: entry
** index (nEntry - 1 - iFree/2), the same mapping multiCursorGetVal() uses.
** Even values of iFree are the positions multiCursorGetVal() returns a
** value for. Presumably odd values stand for the extra "end-delete" key
** mentioned below - confirm against multiCursorGetKey().
**
** If iFree is odd it is simply incremented. If it is even, it normally
** moves on by 2. When the current entry has a negative iId (a delete),
** the loop below adjusts the step depending on whether the preceding
** entries are deletes for consecutive block numbers.
*/
static void flCsrAdvance(MultiCursor *pCsr){
assert( pCsr->flags & CURSOR_FLUSH_FREELIST );
if( pCsr->iFree % 2 ){
pCsr->iFree++;
}else{
int nEntry = pCsr->pDb->pWorker->freelist.nEntry;
FreelistEntry *aEntry = pCsr->pDb->pWorker->freelist.aEntry;
int i = nEntry - 1 - (pCsr->iFree / 2);
/* If the current entry is a delete and the "end-delete" key will not
** be attached to the next entry, increment iFree by 1 only. */
if( aEntry[i].iId<0 ){
while( 1 ){
/* No preceding entry, or it is not for the adjacent block (iBlk-1):
** the decrement here combines with the "+= 2" below for a net
** step of 1. */
if( i==0 || aEntry[i-1].iBlk!=aEntry[i].iBlk-1 ){
pCsr->iFree--;
break;
}
/* The adjacent entry is not a delete: stop with the normal step. */
if( aEntry[i-1].iId>=0 ) break;
/* The adjacent entry is also a delete: skip over it and keep going. */
pCsr->iFree += 2;
i--;
}
}
pCsr->iFree += 2;
}
}
/*
** Advance the multi-cursor to the next entry it should visit, moving
** backwards if bReverse is non-zero and forwards otherwise. If the
** cursor is not valid this is a no-op.
**
** Each pass of the loop steps the sub-cursor at the root of the merge
** tree (aTree[1]) and re-runs the comparisons on the path from that
** sub-cursor's leaf to the root. The loop repeats until
** mcursorAdvanceOk() reports that the cursor has reached an acceptable
** entry, reached EOF, or hit an error.
**
** Returns LSM_OK if successful, or an LSM error code otherwise.
*/
static int multiCursorAdvance(MultiCursor *pCsr, int bReverse){
int rc = LSM_OK; /* Return Code */
if( lsmMCursorValid(pCsr) ){
do {
int iKey = pCsr->aTree[1];
assertCursorTree(pCsr);
/* If this multi-cursor is advancing forwards, and the sub-cursor
** being advanced is the one that separator keys may be being read
** from, record the current absolute pointer value. */
if( pCsr->pPrevMergePtr ){
if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nPtr) ){
assert( pCsr->pBtCsr );
*pCsr->pPrevMergePtr = pCsr->pBtCsr->iPtr;
}else if( pCsr->pBtCsr==0 && pCsr->nPtr>0
&& iKey==(CURSOR_DATA_SEGMENT+pCsr->nPtr-1)
){
SegmentPtr *pPtr = &pCsr->aPtr[iKey-CURSOR_DATA_SEGMENT];
*pCsr->pPrevMergePtr = pPtr->iPtr+pPtr->iPgPtr;
}
}
/* Step whichever kind of sub-cursor iKey identifies: an in-memory tree
** cursor, the free-list pseudo-cursor, the b-tree cursor (index
** CURSOR_DATA_SEGMENT+nPtr) or a segment pointer. The free-list and
** b-tree cursors only move forwards (asserted). */
if( iKey==CURSOR_DATA_TREE0 || iKey==CURSOR_DATA_TREE1 ){
TreeCursor *pTreeCsr = pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0];
if( bReverse ){
rc = lsmTreeCursorPrev(pTreeCsr);
}else{
rc = lsmTreeCursorNext(pTreeCsr);
}
}else if( iKey==CURSOR_DATA_SYSTEM ){
assert( pCsr->flags & CURSOR_FLUSH_FREELIST );
assert( bReverse==0 );
flCsrAdvance(pCsr);
}else if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nPtr) ){
assert( bReverse==0 && pCsr->pBtCsr );
rc = btreeCursorNext(pCsr->pBtCsr);
}else{
rc = segmentCursorAdvance(pCsr, iKey-CURSOR_DATA_SEGMENT, bReverse);
}
if( rc==LSM_OK ){
int i;
/* Only the nodes between the stepped sub-cursor's leaf and the root
** can have changed, so only those comparisons are redone. */
for(i=(iKey+pCsr->nTree)/2; i>0; i=i/2){
multiCursorDoCompare(pCsr, i, bReverse);
}
assertCursorTree(pCsr);
}
}while( mcursorAdvanceOk(pCsr, bReverse, &rc)==0 );
}
return rc;
}
/*
** Advance the cursor to the next entry. The cursor must have the
** CURSOR_NEXT_OK flag set; if it does not, LSM_MISUSE_BKPT is returned.
*/
int lsmMCursorNext(MultiCursor *pCsr){
  if( pCsr->flags & CURSOR_NEXT_OK ){
    return multiCursorAdvance(pCsr, 0);
  }
  return LSM_MISUSE_BKPT;
}
/*
** Move the cursor to the previous entry. The cursor must have the
** CURSOR_PREV_OK flag set; if it does not, LSM_MISUSE_BKPT is returned.
*/
int lsmMCursorPrev(MultiCursor *pCsr){
  if( pCsr->flags & CURSOR_PREV_OK ){
    return multiCursorAdvance(pCsr, 1);
  }
  return LSM_MISUSE_BKPT;
}
/*
** Return the key of the entry the cursor currently points to via *ppKey
** and *pnKey. Always returns LSM_OK.
**
** If the cursor was positioned by an LSM_SEEK_EQ hit, or has no merge
** tree, the cached key (pCsr->key) is returned as is. If the current
** entry comes from an in-memory tree cursor the key is read directly from
** it. Otherwise the cached key is returned, with *ppKey set to NULL if
** it is zero bytes in size.
*/
int lsmMCursorKey(MultiCursor *pCsr, void **ppKey, int *pnKey){
  int iKey;

  if( (pCsr->flags & CURSOR_SEEK_EQ) || pCsr->aTree==0 ){
    *pnKey = pCsr->key.nData;
    *ppKey = pCsr->key.pData;
    return LSM_OK;
  }

  iKey = pCsr->aTree[1];
  if( iKey==CURSOR_DATA_TREE0 || iKey==CURSOR_DATA_TREE1 ){
    lsmTreeCursorKey(pCsr->apTreeCsr[iKey-CURSOR_DATA_TREE0], 0, ppKey, pnKey);
    return LSM_OK;
  }

#ifndef NDEBUG
  {
    /* Check that the cached key matches what the sub-cursor reports. */
    void *pChk;
    int nChk;
    int eChk;
    multiCursorGetKey(pCsr, iKey, &eChk, &pChk, &nChk);
    assert( eChk==pCsr->eType );
    assert( nChk==pCsr->key.nData );
    assert( memcmp(pChk, pCsr->key.pData, nChk)==0 );
  }
#endif

  if( pCsr->key.nData==0 ){
    *ppKey = 0;
  }else{
    *ppKey = pCsr->key.pData;
  }
  *pnKey = pCsr->key.nData;
  return LSM_OK;
}
/*
** Compare the key that cursor csr currently points to with pKey/nKey,
** using the database's key comparison function with topic 0 on both
** sides. Store the result in *piRes and return LSM_OK. If the cursor key
** cannot be read, return the error code and leave *piRes unmodified.
*/
int lsm_csr_cmp(lsm_cursor *csr, const void *pKey, int nKey, int *piRes){
  MultiCursor *pCsr = (MultiCursor *)csr;
  int (*xCompare)(void *, int, void *, int);
  void *pCur;
  int nCur;
  int rc;

  rc = lsmMCursorKey(pCsr, &pCur, &nCur);
  if( rc!=LSM_OK ) return rc;

  xCompare = pCsr->pDb->xCmp;
  *piRes = sortedKeyCompare(xCompare, 0, pCur, nCur, 0, (void *)pKey, nKey);
  return rc;
}
/*
** Return the value of the entry the cursor currently points to via
** *ppVal and *pnVal.
**
** If the cursor was positioned by an LSM_SEEK_EQ hit, or has no merge
** tree, the cached value (pCsr->val) is returned. Otherwise the value is
** read from the current sub-cursor and, if it is not NULL, copied into
** pCsr->val so that the returned pointer refers to the cursor's own
** buffer. On error *ppVal and *pnVal are set to zero and the error code
** is returned.
*/
int lsmMCursorValue(MultiCursor *pCsr, void **ppVal, int *pnVal){
  void *pOut;
  int nOut;
  int rc = LSM_OK;

  if( (pCsr->flags & CURSOR_SEEK_EQ)==0 && pCsr->aTree!=0 ){
    assert( pCsr->aTree );
    assert( mcursorLocationOk(pCsr, (pCsr->flags & CURSOR_IGNORE_DELETE)) );

    rc = multiCursorGetVal(pCsr, pCsr->aTree[1], &pOut, &nOut);
    if( rc==LSM_OK && pOut ){
      rc = sortedBlobSet(pCsr->pDb->pEnv, &pCsr->val, pOut, nOut);
      pOut = pCsr->val.pData;
    }
    if( rc!=LSM_OK ){
      pOut = 0;
      nOut = 0;
    }
  }else{
    nOut = pCsr->val.nData;
    pOut = pCsr->val.pData;
  }

  *ppVal = pOut;
  *pnVal = nOut;
  return rc;
}
/*
** Set *peType to the type flags of the entry that the sub-cursor at the
** root of the merge tree points to. The cursor must have a merge tree
** (asserted). Always returns LSM_OK.
*/
int lsmMCursorType(MultiCursor *pCsr, int *peType){
  int iKey;

  assert( pCsr->aTree );
  iKey = pCsr->aTree[1];
  multiCursorGetKey(pCsr, iKey, peType, 0, 0);
  return LSM_OK;
}
/*
** Buffer aData[], size nData, is assumed to contain a valid b-tree
** hierarchy page image. Return the offset in aData[] of the next free
** byte in the data area (where a new cell may be written if there is
** space).
**
** The offset is found by locating the last record on the page (cell
** pointer nRec-1) and skipping over it: the type byte, a 64-bit varint,
** a 32-bit varint and, unless the type byte is 0x00, that many bytes of
** key data.
**
** NOTE(review): when the page holds no records (nRec==0) the cell-pointer
** index passed to SEGMENT_CELLPTR_OFFSET() is -1. What that reads depends
** on the footer layout macros, which are not visible here. Callers in this
** file do invoke this function on freshly zeroed pages - confirm the
** result is as intended.
*/
static int mergeWorkerPageOffset(u8 *aData, int nData){
int nRec;
int iOff;
int nKey;
int eType;
i64 nDummy;
nRec = lsmGetU16(&aData[SEGMENT_NRECORD_OFFSET(nData)]);
iOff = lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec-1)]);
eType = aData[iOff++];
assert( eType==0
|| eType==(LSM_SYSTEMKEY|LSM_SEPARATOR)
|| eType==(LSM_SEPARATOR)
);
/* Skip the pointer varint, then read the second varint. For an ordinary
** record this is the key size. For an indirect record (eType==0) it is
** not added to the offset below. */
iOff += lsmVarintGet64(&aData[iOff], &nDummy);
iOff += lsmVarintGet32(&aData[iOff], &nKey);
return iOff + (eType ? nKey : 0);
}
/*
** Following a checkpoint operation, database pages that are part of the
** checkpointed state of the LSM are deemed read-only. This includes the
** right-most page of the b-tree hierarchy of any separators array under
** construction, and all pages between it and the b-tree root, inclusive.
** This is a problem, as when further pages are appended to the separators
** array, entries must be added to the indicated b-tree hierarchy pages.
**
** This function copies all such b-tree pages to new locations, so that
** they can be modified as required. Each entry of pMW->hier.apHier[] is
** replaced by a newly appended page holding the same content, and the
** reference to the old page is released.
**
** The complication is that not all database pages are the same size - due
** to the way the file.c module works some (the first and last in each block)
** are 4 bytes smaller than the others. Only the cases where the new page
** is the same size as, or 4 bytes smaller than, the old one are handled
** (asserted).
**
** Parameter bSep is not used by this implementation.
**
** Returns LSM_OK if successful. If appending a page fails, the error code
** is returned; entries already processed refer to their new pages and the
** remaining entries are left untouched.
*/
static int mergeWorkerMoveHierarchy(
  MergeWorker *pMW,               /* Merge worker */
  int bSep                        /* True for separators run */
){
  lsm_db *pDb = pMW->pDb;         /* Database handle */
  int rc = LSM_OK;                /* Return code */
  int i;
  Page **apHier = pMW->hier.apHier;
  int nHier = pMW->hier.nHier;

  for(i=0; rc==LSM_OK && i<nHier; i++){
    Page *pNew = 0;

    /* A failure here is a runtime condition, not a logic error. It is
    ** reported through the return code rather than trapped by assert(). */
    rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pMW->pLevel, 1, &pNew);
    if( rc==LSM_OK ){
      u8 *a1; int n1;             /* Data and size of the new page */
      u8 *a2; int n2;             /* Data and size of the old page */

      a1 = fsPageData(pNew, &n1);
      a2 = fsPageData(apHier[i], &n2);

      assert( n1==n2 || n1+4==n2 );

      if( n1==n2 ){
        memcpy(a1, a2, n2);
      }else{
        /* The new page is 4 bytes smaller. Copy the record area and the
        ** footer separately so that the footer still ends at the end of
        ** the page. The last 4 bytes of the old record area are dropped. */
        int nEntry = pageGetNRec(a2, n2);
        int iEof1 = SEGMENT_EOF(n1, nEntry);
        int iEof2 = SEGMENT_EOF(n2, nEntry);

        memcpy(a1, a2, iEof2 - 4);
        memcpy(&a1[iEof1], &a2[iEof2], n2 - iEof2);
      }

      lsmFsPageRelease(apHier[i]);
      apHier[i] = pNew;
    }
  }

#ifdef LSM_DEBUG
  if( rc==LSM_OK ){
    for(i=0; i<nHier; i++) assert( lsmFsPageWritable(apHier[i]) );
  }
#endif

  return rc;
}
/*
** Allocate and populate the MergeWorker.apHier[] array.
**
** This is a no-op if the array has already been loaded or if the output
** segment (pMW->pLevel->lhs) has no b-tree root. Otherwise, starting at
** the root page, pages are loaded by following the footer pointer of each
** b-tree page until a page without the SEGMENT_BTREE_FLAG is reached.
** Each b-tree page is inserted at the front of the array, so that
** apHier[0] is the page loaded last and the root is the final entry.
** pMW->aSave[0].iPgno is set to the footer pointer of apHier[0] and the
** hierarchy is then copied to writable pages by
** mergeWorkerMoveHierarchy().
**
** Returns LSM_OK if successful. LSM_NOMEM_BKPT is returned if the array
** cannot be grown, and LSM_CORRUPT_BKPT if the root page is not a b-tree
** page. If loading fails, all page references obtained here are released
** and pMW->hier is left unmodified.
*/
static int mergeWorkerLoadHierarchy(MergeWorker *pMW){
  int rc = LSM_OK;
  Segment *pSeg = &pMW->pLevel->lhs;
  Hierarchy *p = &pMW->hier;

  if( p->apHier==0 && pSeg->iRoot!=0 ){
    FileSystem *pFS = pMW->pDb->pFS;
    lsm_env *pEnv = pMW->pDb->pEnv;
    Page **apHier = 0;
    int nHier = 0;
    LsmPgno iPg = pSeg->iRoot;

    do {
      Page *pPg = 0;
      u8 *aData;
      int nData;
      int flags;

      rc = lsmFsDbPageGet(pFS, pSeg, iPg, &pPg);
      if( rc!=LSM_OK ) break;

      aData = fsPageData(pPg, &nData);
      flags = pageGetFlags(aData, nData);
      if( flags&SEGMENT_BTREE_FLAG ){
        Page **apNew = (Page **)lsmRealloc(
            pEnv, apHier, sizeof(Page *)*(nHier+1)
        );
        if( apNew==0 ){
          /* pPg has not been stored in apHier[] yet, so it must be
          ** released here or the reference is lost. */
          lsmFsPageRelease(pPg);
          rc = LSM_NOMEM_BKPT;
          break;
        }
        apHier = apNew;
        memmove(&apHier[1], &apHier[0], sizeof(Page *) * nHier);
        nHier++;

        apHier[0] = pPg;
        iPg = pageGetPtr(aData, nData);
      }else{
        lsmFsPageRelease(pPg);
        break;
      }
    }while( 1 );

    /* If the root page itself is not a b-tree page, the array is still
    ** empty and apHier[0] must not be dereferenced. */
    if( rc==LSM_OK && nHier==0 ){
      rc = LSM_CORRUPT_BKPT;
    }

    if( rc==LSM_OK ){
      u8 *aData;
      int nData;
      aData = fsPageData(apHier[0], &nData);
      pMW->aSave[0].iPgno = pageGetPtr(aData, nData);

      p->nHier = nHier;
      p->apHier = apHier;
      rc = mergeWorkerMoveHierarchy(pMW, 0);
    }else{
      int i;
      for(i=0; i<nHier; i++){
        lsmFsPageRelease(apHier[i]);
      }
      lsmFree(pEnv, apHier);
    }
  }

  return rc;
}
/*
** B-tree pages use almost the same format as regular pages. The
** differences are:
**
** 1. The record format is (usually, see below) as follows:
**
** + Type byte (always SORTED_SEPARATOR or SORTED_SYSTEM_SEPARATOR),
** + Absolute pointer value (varint),
** + Number of bytes in key (varint),
** + LsmBlob containing key data.
**
** 2. All pointer values are stored as absolute values (not offsets
** relative to the footer pointer value).
**
** 3. Each pointer that is part of a record points to a page that
** contains keys smaller than the record's key (note: strictly
** "smaller than", not "equal to or smaller than").
**
** 4. The pointer in the page footer of a b-tree page points to a page
** that contains keys equal to or larger than the largest key on the
** b-tree page.
**
** The reason for having the page footer pointer point to the right-child
** (instead of the left) is that doing things this way makes the
** mergeWorkerMoveHierarchy() operation less complicated (since the pointers
** that need to be updated are all stored as fixed-size integers within the
** page footer, not varints in page records).
**
** Records may not span b-tree pages. If this function is called to add a
** record larger than (page-size / 4) bytes, then a pointer to the indexed
** array page that contains the main record is added to the b-tree instead.
** In this case the record format is:
**
** + 0x00 byte (1 byte)
** + Absolute pointer value (varint),
** + Absolute page number of page containing key (varint).
**
** See function seekInBtree() for the code that traverses b-tree pages.
*/
/*
** Append one record to the b-tree hierarchy being built by merge-worker
** pMW. For an ordinary record eType is non-zero, iKeyPg is zero and the
** key (pKey/nKey) is stored on the b-tree page. For an indirect record
** eType and nKey are zero and iKeyPg is the number of the page that holds
** the key. iPtr is the pointer value stored with the record.
**
** Returns LSM_OK if successful, LSM_NOMEM_BKPT if the hierarchy array
** cannot be grown, or the error code from persisting or appending a page.
*/
static int mergeWorkerBtreeWrite(
  MergeWorker *pMW,
  u8 eType,
  LsmPgno iPtr,
  LsmPgno iKeyPg,
  void *pKey,
  int nKey
){
  Hierarchy *p = &pMW->hier;
  lsm_db *pDb = pMW->pDb;         /* Database handle */
  int rc = LSM_OK;                /* Return Code */
  int iLevel;                     /* Level of b-tree hierachy to write to */
  int nData;                      /* Size of aData[] in bytes */
  u8 *aData;                      /* Page data for level iLevel */
  int iOff;                       /* Offset on b-tree page to write record to */
  int nRec;                       /* Initial number of records on b-tree page */

  /* iKeyPg should be zero for an ordinary b-tree key, or non-zero for an
  ** indirect key. The flags byte for an indirect key is 0x00.  */
  assert( (eType==0)==(iKeyPg!=0) );

  /* The MergeWorker.apHier[] array contains the right-most leaf of the b-tree
  ** hierarchy, the root node, and all nodes that lie on the path between.
  ** apHier[0] is the right-most leaf and apHier[pMW->nHier-1] is the current
  ** root page.
  **
  ** This loop searches for a node with enough space to store the key on,
  ** starting with the leaf and iterating up towards the root. When the loop
  ** exits, the key may be written to apHier[iLevel].  */
  for(iLevel=0; iLevel<=p->nHier; iLevel++){
    int nByte;                    /* Number of free bytes required */

    if( iLevel==p->nHier ){
      /* Extend the array and allocate a new root page. */
      Page **aNew;
      aNew = (Page **)lsmRealloc(
          pMW->pDb->pEnv, p->apHier, sizeof(Page *)*(p->nHier+1)
      );
      if( !aNew ){
        return LSM_NOMEM_BKPT;
      }
      p->apHier = aNew;
    }else{
      Page *pOld;
      int nFree;

      /* If the key will fit on this page, break out of the loop here.
      ** The new entry will be written to page apHier[iLevel]. */
      pOld = p->apHier[iLevel];
      assert( lsmFsPageWritable(pOld) );
      aData = fsPageData(pOld, &nData);
      if( eType==0 ){
        nByte = 2 + 1 + lsmVarintLen64(iPtr) + lsmVarintLen64(iKeyPg);
      }else{
        nByte = 2 + 1 + lsmVarintLen64(iPtr) + lsmVarintLen32(nKey) + nKey;
      }
      nRec = pageGetNRec(aData, nData);
      nFree = SEGMENT_EOF(nData, nRec) - mergeWorkerPageOffset(aData, nData);
      if( nByte<=nFree ) break;

      /* Otherwise, this page is full. Set the right-hand-child pointer
      ** to iPtr and release it. The pointer passed up to the parent level
      ** becomes the page number assigned to this page when it is
      ** persisted. */
      lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iPtr);
      assert( lsmFsPageNumber(pOld)==0 );
      rc = lsmFsPagePersist(pOld);
      if( rc==LSM_OK ){
        iPtr = lsmFsPageNumber(pOld);
      }
      /* Release the reference even if persisting failed. The slot in
      ** apHier[] is overwritten just below, so this is the last chance
      ** to do so. */
      lsmFsPageRelease(pOld);
    }

    /* Allocate a new page for apHier[iLevel]. */
    p->apHier[iLevel] = 0;
    if( rc==LSM_OK ){
      rc = lsmFsSortedAppend(
          pDb->pFS, pDb->pWorker, pMW->pLevel, 1, &p->apHier[iLevel]
      );
    }
    if( rc!=LSM_OK ) return rc;

    aData = fsPageData(p->apHier[iLevel], &nData);
    memset(aData, 0, nData);
    lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], SEGMENT_BTREE_FLAG);
    lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], 0);

    if( iLevel==p->nHier ){
      p->nHier++;
      break;
    }
  }

  /* Write the key into page apHier[iLevel]. */
  aData = fsPageData(p->apHier[iLevel], &nData);
  iOff = mergeWorkerPageOffset(aData, nData);
  nRec = pageGetNRec(aData, nData);
  lsmPutU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec)], (u16)iOff);
  lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], (u16)(nRec+1));
  if( eType==0 ){
    aData[iOff++] = 0x00;
    iOff += lsmVarintPut64(&aData[iOff], iPtr);
    iOff += lsmVarintPut64(&aData[iOff], iKeyPg);
  }else{
    aData[iOff++] = eType;
    iOff += lsmVarintPut64(&aData[iOff], iPtr);
    iOff += lsmVarintPut32(&aData[iOff], nKey);
    memcpy(&aData[iOff], pKey, nKey);
  }

  return rc;
}
/*
** If an indirect b-tree key is pending (pMW->iIndirect is non-zero),
** write it to the b-tree hierarchy now, using pMW->aSave[1].iPgno as the
** number of the page that holds the key, and clear pMW->iIndirect.
** Returns LSM_OK if there is nothing to do, or the result of the write.
*/
static int mergeWorkerBtreeIndirect(MergeWorker *pMW){
  int rc;

  if( pMW->iIndirect==0 ) return LSM_OK;

  rc = mergeWorkerBtreeWrite(
      pMW, 0, pMW->iIndirect, pMW->aSave[1].iPgno, 0, 0
  );
  pMW->iIndirect = 0;
  return rc;
}
/*
** Append the database key (iTopic/pKey/nKey) to the b-tree under
** construction. This key has not yet been written to a segment page.
** The pointer that will accompany the new key in the b-tree - that
** points to the completed segment page that contains keys smaller than
** (pKey/nKey) is currently stored in pMW->aSave[0].iPgno.
**
** Any pending indirect key is flushed first. If the new key is larger
** than one quarter of the page size, it is not written now. Instead it is
** recorded as a pending indirect key and aSave[1].bStore is set, so that
** the number of the page that will hold it is captured when that page is
** persisted.
**
** Returns LSM_OK if successful, or an LSM error code otherwise. An error
** from flushing the pending indirect key is returned immediately and the
** new key is not added.
*/
static int mergeWorkerPushHierarchy(
  MergeWorker *pMW,               /* Merge worker object */
  int iTopic,                     /* Topic value for this key */
  void *pKey,                     /* Pointer to key buffer */
  int nKey                        /* Size of pKey buffer in bytes */
){
  int rc = LSM_OK;                /* Return Code */
  LsmPgno iPtr;                   /* Pointer value to accompany pKey/nKey */

  assert( pMW->aSave[0].bStore==0 );
  assert( pMW->aSave[1].bStore==0 );

  /* Do not let a failure here be masked by the result of the write
  ** below. */
  rc = mergeWorkerBtreeIndirect(pMW);
  if( rc!=LSM_OK ) return rc;

  /* Obtain the absolute pointer value to store along with the key in the
  ** page body. This pointer points to a page that contains keys that are
  ** smaller than pKey/nKey.  */
  iPtr = pMW->aSave[0].iPgno;
  assert( iPtr!=0 );

  /* Determine if the indirect format should be used. */
  if( (nKey*4 > lsmFsPageSize(pMW->pDb->pFS)) ){
    pMW->iIndirect = iPtr;
    pMW->aSave[1].bStore = 1;
  }else{
    rc = mergeWorkerBtreeWrite(
        pMW, (u8)(iTopic | LSM_SEPARATOR), iPtr, 0, pKey, nKey
    );
  }

  return rc;
}
/*
** Finish the b-tree hierarchy being built by merge-worker pMW.
**
** Working from apHier[0] upwards, the footer pointer of each page is set
** to the pointer value carried up from below (initially
** pMW->aSave[0].iPgno), the page is persisted, and its page number becomes
** the pointer value for the next level. The last value is stored as the
** root of the output segment (pMW->pLevel->lhs.iRoot) and the hierarchy
** array is freed.
**
** Every page reference in the hierarchy is released, even if persisting
** one of the pages fails. In that case the remaining pages are released
** without being persisted and the error code is returned.
*/
static int mergeWorkerFinishHierarchy(
  MergeWorker *pMW                /* Merge worker object */
){
  int i;                          /* Used to loop through apHier[] */
  int rc = LSM_OK;                /* Return code */
  LsmPgno iPtr;                   /* New right-hand-child pointer value */

  iPtr = pMW->aSave[0].iPgno;
  for(i=0; i<pMW->hier.nHier; i++){
    Page *pPg = pMW->hier.apHier[i];

    if( rc==LSM_OK ){
      int nData;                  /* Size of aData[] in bytes */
      u8 *aData;                  /* Page data for pPg */

      aData = fsPageData(pPg, &nData);
      lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iPtr);

      rc = lsmFsPagePersist(pPg);
      iPtr = lsmFsPageNumber(pPg);
    }

    /* The array is freed below, so the reference must be dropped here
    ** whether or not the page was persisted. */
    lsmFsPageRelease(pPg);
  }

  if( pMW->hier.nHier ){
    pMW->pLevel->lhs.iRoot = iPtr;
    lsmFree(pMW->pDb->pEnv, pMW->hier.apHier);
    pMW->hier.apHier = 0;
    pMW->hier.nHier = 0;
  }

  return rc;
}
/*
** Invoke lsmFsSortedPadding() on the output segment (pLevel->lhs) of
** merge-worker pMW, using the worker snapshot. Returns its result.
*/
static int mergeWorkerAddPadding(
  MergeWorker *pMW                /* Merge worker object */
){
  lsm_db *pDb = pMW->pDb;
  return lsmFsSortedPadding(pDb->pFS, pDb->pWorker, &pMW->pLevel->lhs);
}
/*
** Drop every page reference still held by merge-worker pMW: the current
** output page and all pages of the b-tree hierarchy. The hierarchy array
** itself is freed as well. Unless an error has occurred, all pages have
** already been released by the time this is called.
*/
static void mergeWorkerReleaseAll(MergeWorker *pMW){
  Hierarchy *pHier = &pMW->hier;
  int iLvl;

  lsmFsPageRelease(pMW->pPage);
  pMW->pPage = 0;

  for(iLvl=0; iLvl<pHier->nHier; iLvl++){
    lsmFsPageRelease(pHier->apHier[iLvl]);
    pHier->apHier[iLvl] = 0;
  }

  lsmFree(pMW->pDb->pEnv, pHier->apHier);
  pHier->apHier = 0;
  pHier->nHier = 0;
}
/*
** Return (nKey * 4) divided by the database page size (integer division),
** capped at a maximum of 3.
*/
static int keyszToSkip(FileSystem *pFS, int nKey){
  int nQuarter;                   /* Key size in quarter-page units */

  nQuarter = (nKey * 4) / lsmFsPageSize(pFS);
  return LSM_MIN(nQuarter, 3);
}
/*
** Persist and then release the current output page of merge-worker *pMW
** (reference pMW->pPage). For each aSave[] entry that has its bStore flag
** set, the page number of the page is recorded in iPgno and the flag is
** cleared. pMW->pPage is set to NULL before returning.
**
** The return value is that of lsmFsPagePersist(). The page numbers are
** recorded and the page released even if it reports an error.
*/
static int mergeWorkerPersistAndRelease(MergeWorker *pMW){
  Page *pPg = pMW->pPage;
  int iSave;
  int rc;

  assert( pPg || (pMW->aSave[0].bStore==0 && pMW->aSave[1].bStore==0) );

  /* Persist the page */
  rc = lsmFsPagePersist(pPg);

  /* If required, save the page number. */
  for(iSave=0; iSave<2; iSave++){
    if( pMW->aSave[iSave].bStore==0 ) continue;
    pMW->aSave[iSave].iPgno = lsmFsPageNumber(pPg);
    pMW->aSave[iSave].bStore = 0;
  }

  /* Release the completed output page. */
  lsmFsPageRelease(pPg);
  pMW->pPage = 0;
  return rc;
}
/*
** Move merge-worker pMW on to a new output page. A page is appended to
** the output run, the previous output page is persisted and released,
** and the new page becomes pMW->pPage with the output offset reset to 0.
** The footer of the new page is initialized with a record count of zero,
** a cleared flags field and a footer pointer of iFPtr. pMW->nWork is
** incremented.
**
** If the new page cannot be appended, the error code is returned and the
** merge-worker is left unchanged. Otherwise the return value is the
** result of persisting the previous page.
*/
static int mergeWorkerNextPage(
  MergeWorker *pMW,               /* Merge worker object to append page to */
  LsmPgno iFPtr                   /* Pointer value for footer of new page */
){
  lsm_db *pDb = pMW->pDb;         /* Database handle */
  Page *pNew = 0;                 /* New page appended to run */
  u8 *aFooter;                    /* Data buffer belonging to page pNew */
  int nFooter;                    /* Size of aFooter[] in bytes */
  int rc;                         /* Return code */

  rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pMW->pLevel, 0, &pNew);
  assert( rc || pMW->pLevel->lhs.iFirst>0 || pMW->pDb->compress.xCompress );
  if( rc!=LSM_OK ) return rc;

  /* Finish with the old page, then switch over to the new one. */
  rc = mergeWorkerPersistAndRelease(pMW);
  pMW->pPage = pNew;
  pMW->pLevel->pMerge->iOutputOff = 0;

  aFooter = fsPageData(pNew, &nFooter);
  lsmPutU16(&aFooter[SEGMENT_NRECORD_OFFSET(nFooter)], 0);
  lsmPutU16(&aFooter[SEGMENT_FLAGS_OFFSET(nFooter)], 0);
  lsmPutU64(&aFooter[SEGMENT_POINTER_OFFSET(nFooter)], iFPtr);
  pMW->nWork++;

  return rc;
}
/*
** Copy the nWrite byte blob aWrite[] into the output segment of merge-worker
** pMW, starting at the current output offset. Whenever the current page
** fills up before the blob is exhausted, mergeWorkerNextPage() is invoked
** (with iFPtr as the footer pointer of the new page) and copying continues
** at the start of the new page.
**
** This function is used to write the blobs of data for keys and values.
** Parameter bSep is accepted for interface compatibility but is not
** consulted by this implementation.
*/
static int mergeWorkerData(
  MergeWorker *pMW,               /* Merge worker object */
  int bSep,                       /* True to write to separators run */
  LsmPgno iFPtr,                  /* Footer ptr for new pages */
  u8 *aWrite,                     /* Write data from this buffer */
  int nWrite                      /* Size of aWrite[] in bytes */
){
  int rc = LSM_OK;                /* Return code */
  int nDone = 0;                  /* Number of bytes already written */

  while( rc==LSM_OK && nDone<nWrite ){
    Merge *pMerge = pMW->pLevel->pMerge;
    u8 *aData;                    /* Buffer of the current output page */
    int nData;                    /* Size of aData[] in bytes */
    int nRec;                     /* Number of records on current page */
    int iOff;                     /* Offset in aData[] to write to */
    int nSpace;                   /* Bytes available on the current page */
    int nCopy;                    /* Bytes to copy during this iteration */

    assert( lsmFsPageWritable(pMW->pPage) );

    aData = fsPageData(pMW->pPage, &nData);
    nRec = pageGetNRec(aData, nData);
    iOff = pMerge->iOutputOff;

    /* Copy as much as is left, limited by the space before the footer. */
    nSpace = SEGMENT_EOF(nData, nRec) - iOff;
    nCopy = nWrite - nDone;
    if( nSpace<nCopy ) nCopy = nSpace;
    memcpy(&aData[iOff], &aWrite[nDone], nCopy);
    nDone += nCopy;

    if( nDone<nWrite ){
      /* Page is full and there is more to write. */
      rc = mergeWorkerNextPage(pMW, iFPtr);
    }else{
      pMerge->iOutputOff = iOff + nCopy;
    }
  }

  return rc;
}
/*
** Called before merge-worker pMW writes the first key to the first page of
** its output. It determines the footer pointer for that first page and
** creates the page:
**
**   * If the cursor has a b-tree cursor attached, the pointer is the first
**     page number of the lhs segment of the next level.
**
**   * Otherwise, if the cursor has at least one segment pointer, the
**     pointer is read from the footer of the first page of the segment
**     belonging to the last entry in the cursor's aPtr[] array.
**
**   * Otherwise the pointer is zero.
**
** The pointer is also copied to *pCsr->pPrevMergePtr (if set) and slot 0 of
** pMW->aSave[] is flagged so that the page number gets recorded.
*/
static int mergeWorkerFirstPage(MergeWorker *pMW){
  int rc = LSM_OK;                /* Return code */
  LsmPgno iFPtr = 0;              /* Footer pointer for the new page */
  MultiCursor *pCsr = pMW->pCsr;

  assert( pMW->pPage==0 );

  if( pCsr->pBtCsr ){
    iFPtr = pMW->pLevel->pNext->lhs.iFirst;
  }else if( pCsr->nPtr>0 ){
    Segment *pSeg = pCsr->aPtr[pCsr->nPtr-1].pSeg;
    Page *pPg = 0;                /* First page of run pSeg */

    rc = lsmFsDbPageGet(pMW->pDb->pFS, pSeg, pSeg->iFirst, &pPg);
    if( rc!=LSM_OK ){
      return rc;
    }else{
      u8 *aData;                  /* Buffer for page pPg */
      int nData;                  /* Size of aData[] in bytes */
      aData = fsPageData(pPg, &nData);
      iFPtr = pageGetPtr(aData, nData);
      lsmFsPageRelease(pPg);
    }
  }

  rc = mergeWorkerNextPage(pMW, iFPtr);
  if( pCsr->pPrevMergePtr ){
    *pCsr->pPrevMergePtr = iFPtr;
  }
  pMW->aSave[0].bStore = 1;

  return rc;
}
/*
** Append a single record to the output segment being populated by
** merge-worker pMW.
**
** If the output segment has no first page yet and the worker holds no
** output page, mergeWorkerFirstPage() is called to create one. The record
** header (type byte and varints) is always written onto a single page; if
** it does not fit on the current page the remainder of that page is zeroed
** and a new page is started. The key and value blobs are then written with
** mergeWorkerData() and may spill onto following pages.
**
** When the record is the first on its page (and the page is not the first
** of the run) the key is either pushed into the b-tree hierarchy, or the
** page is flagged PGFTR_SKIP_THIS_FLAG, depending on Merge.nSkip.
**
** Returns LSM_OK on success, or the first error code encountered.
*/
static int mergeWorkerWrite(
MergeWorker *pMW, /* Merge worker object to write into */
int eType, /* One of SORTED_SEPARATOR, WRITE or DELETE */
void *pKey, int nKey, /* Key value */
void *pVal, int nVal, /* Value value */
LsmPgno iPtr /* Absolute value of page pointer, or 0 */
){
int rc = LSM_OK; /* Return code */
Merge *pMerge; /* Persistent part of level merge state */
int nHdr; /* Space required for this record header */
Page *pPg; /* Page to write to */
u8 *aData; /* Data buffer for page pWriter->pPage */
int nData = 0; /* Size of buffer aData[] in bytes */
int nRec = 0; /* Number of records on page pPg */
LsmPgno iFPtr = 0; /* Value of pointer in footer of pPg */
LsmPgno iRPtr = 0; /* Value of pointer written into record */
int iOff = 0; /* Current write offset within page pPg */
Segment *pSeg; /* Segment being written */
int flags = 0; /* If != 0, flags value for page footer */
int bFirst = 0; /* True for first key of output run */
pMerge = pMW->pLevel->pMerge;
pSeg = &pMW->pLevel->lhs;
/* No output page exists at all yet: create the first page of the run. */
if( pSeg->iFirst==0 && pMW->pPage==0 ){
rc = mergeWorkerFirstPage(pMW);
bFirst = 1;
}
pPg = pMW->pPage;
if( pPg ){
aData = fsPageData(pPg, &nData);
nRec = pageGetNRec(aData, nData);
iFPtr = pageGetPtr(aData, nData);
/* The record stores the pointer relative to the page footer pointer. */
iRPtr = iPtr ? (iPtr - iFPtr) : 0;
}
/* Figure out how much space is required by the new record. The space
** required is divided into two sections: the header and the body. The
** header consists of the initial varint fields. The body are the blobs
** of data that correspond to the key and value data. The entire header
** must be stored on the page. The body may overflow onto the next and
** subsequent pages.
**
** The header space is:
**
** 1) record type - 1 byte.
** 2) Page-pointer-offset - 1 varint
** 3) Key size - 1 varint
** 4) Value size - 1 varint (only if LSM_INSERT flag is set)
*/
if( rc==LSM_OK ){
nHdr = 1 + lsmVarintLen64(iRPtr) + lsmVarintLen32(nKey);
if( rtIsWrite(eType) ) nHdr += lsmVarintLen32(nVal);
/* If the entire header will not fit on page pPg, if there is no current
** page, or if the saved output offset is negative, advance to the next
** page of the output run. */
iOff = pMerge->iOutputOff;
if( iOff<0 || pPg==0 || iOff+nHdr > SEGMENT_EOF(nData, nRec+1) ){
if( iOff>=0 && pPg ){
/* Zero any free space on the page */
assert( aData );
memset(&aData[iOff], 0, SEGMENT_EOF(nData, nRec)-iOff);
}
/* NOTE(review): pPrevMergePtr is dereferenced here without a NULL check,
** unlike in mergeWorkerFirstPage(). Both sortedNewToplevel() and
** mergeWorkerInit() set it before any record is written. */
iFPtr = *pMW->pCsr->pPrevMergePtr;
iRPtr = iPtr ? (iPtr - iFPtr) : 0;
iOff = 0;
nRec = 0;
rc = mergeWorkerNextPage(pMW, iFPtr);
pPg = pMW->pPage;
}
}
/* If this record header will be the first on the page, and the page is
** not the very first in the entire run, add a copy of the key to the
** b-tree hierarchy.
*/
if( rc==LSM_OK && nRec==0 && bFirst==0 ){
assert( pMerge->nSkip>=0 );
if( pMerge->nSkip==0 ){
rc = mergeWorkerPushHierarchy(pMW, rtTopic(eType), pKey, nKey);
assert( pMW->aSave[0].bStore==0 );
pMW->aSave[0].bStore = 1;
pMerge->nSkip = keyszToSkip(pMW->pDb->pFS, nKey);
}else{
/* This page is skipped: its first key is not added to the hierarchy. */
pMerge->nSkip--;
flags = PGFTR_SKIP_THIS_FLAG;
}
if( pMerge->nSkip ) flags |= PGFTR_SKIP_NEXT_FLAG;
}
/* Update the output segment */
if( rc==LSM_OK ){
aData = fsPageData(pPg, &nData);
/* Update the page footer. */
lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], (u16)(nRec+1));
lsmPutU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec)], (u16)iOff);
if( flags ) lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], (u16)flags);
/* Write the entry header into the current page. */
aData[iOff++] = (u8)eType; /* 1 */
iOff += lsmVarintPut64(&aData[iOff], iRPtr); /* 2 */
iOff += lsmVarintPut32(&aData[iOff], nKey); /* 3 */
if( rtIsWrite(eType) ) iOff += lsmVarintPut32(&aData[iOff], nVal); /* 4 */
pMerge->iOutputOff = iOff;
/* Write the key and data into the segment. */
assert( iFPtr==pageGetPtr(aData, nData) );
rc = mergeWorkerData(pMW, 0, iFPtr+iRPtr, pKey, nKey);
/* NOTE(review): the inner rc==LSM_OK test below repeats the outer one. */
if( rc==LSM_OK && rtIsWrite(eType) ){
if( rc==LSM_OK ){
rc = mergeWorkerData(pMW, 0, iFPtr+iRPtr, pVal, nVal);
}
}
}
return rc;
}
/*
** Free all resources allocated by mergeWorkerInit().
**
** Parameter pRc is both input and output. If *pRc is LSM_OK on entry and
** the worker has a cursor, then:
**
**   * if the cursor still points at a valid entry (the merge is not yet
**     finished), the position of each input is saved in Merge.aInput[] and
**     the split-key location in Merge.splitkey;
**   * any unused space on the final output page is zeroed; and
**   * Merge.iOutputOff is set to -1.
**
** The cursor is then closed. While no error has occurred the output page is
** persisted and released, followed by mergeWorkerBtreeIndirect(),
** mergeWorkerFinishHierarchy() and mergeWorkerAddPadding(). Regardless of
** errors, lsmFsFlushWaiting() is called, all remaining page references are
** released and the aGobble[] array is freed. The final result code is
** written back to *pRc.
*/
static void mergeWorkerShutdown(MergeWorker *pMW, int *pRc){
int i; /* Iterator variable */
int rc = *pRc;
MultiCursor *pCsr = pMW->pCsr;
/* Unless the merge has finished, save the cursor position in the
** Merge.aInput[] array. See function mergeWorkerInit() for the
** code to restore a cursor position based on aInput[]. */
if( rc==LSM_OK && pCsr ){
Merge *pMerge = pMW->pLevel->pMerge;
if( lsmMCursorValid(pCsr) ){
int bBtree = (pCsr->pBtCsr!=0);
int iPtr;
/* pMerge->nInput==0 indicates that this is a FlushTree() operation. */
assert( pMerge->nInput==0 || pMW->pLevel->nRight>0 );
assert( pMerge->nInput==0 || pMerge->nInput==(pCsr->nPtr+bBtree) );
/* Save the page number and cell index of each segment input. An input
** with no page loaded is saved as (0, 0). */
for(i=0; i<(pMerge->nInput-bBtree); i++){
SegmentPtr *pPtr = &pCsr->aPtr[i];
if( pPtr->pPg ){
pMerge->aInput[i].iPg = lsmFsPageNumber(pPtr->pPg);
pMerge->aInput[i].iCell = pPtr->iCell;
}else{
pMerge->aInput[i].iPg = 0;
pMerge->aInput[i].iCell = 0;
}
}
/* The b-tree cursor, if any, occupies the final aInput[] slot. */
if( bBtree && pMerge->nInput ){
assert( i==pCsr->nPtr );
btreeCursorPosition(pCsr->pBtCsr, &pMerge->aInput[i]);
}
/* Store the location of the split-key */
iPtr = pCsr->aTree[1] - CURSOR_DATA_SEGMENT;
if( iPtr<pCsr->nPtr ){
pMerge->splitkey = pMerge->aInput[iPtr];
}else{
btreeCursorSplitkey(pCsr->pBtCsr, &pMerge->splitkey);
}
}
/* Zero any free space left on the final page. This helps with
** compression if using a compression hook. And prevents valgrind
** from complaining about uninitialized byte passed to write(). */
if( pMW->pPage ){
int nData;
u8 *aData = fsPageData(pMW->pPage, &nData);
int iOff = pMerge->iOutputOff;
int iEof = SEGMENT_EOF(nData, pageGetNRec(aData, nData));
memset(&aData[iOff], 0, iEof - iOff);
}
pMerge->iOutputOff = -1;
}
lsmMCursorClose(pCsr, 0);
/* Persist and release the output page. */
if( rc==LSM_OK ) rc = mergeWorkerPersistAndRelease(pMW);
if( rc==LSM_OK ) rc = mergeWorkerBtreeIndirect(pMW);
if( rc==LSM_OK ) rc = mergeWorkerFinishHierarchy(pMW);
if( rc==LSM_OK ) rc = mergeWorkerAddPadding(pMW);
/* The remaining cleanup runs whether or not an error has occurred. */
lsmFsFlushWaiting(pMW->pDb->pFS, &rc);
mergeWorkerReleaseAll(pMW);
lsmFree(pMW->pDb->pEnv, pMW->aGobble);
pMW->aGobble = 0;
pMW->pCsr = 0;
*pRc = rc;
}
/*
** The cursor passed as the first argument is being used as the input for
** a merge operation. When this function is called, *piFlags contains the
** database entry flags for the current entry, which is the entry about to
** be written to the output. On return *piFlags holds the flags that should
** be written instead; a value of 0 is used by the caller to mean that no
** entry is written for this key.
**
** *piVal is the index of the sub-cursor that supplies the value. It is
** updated when another sub-cursor positioned on the same key contributes
** the LSM_INSERT flag.
**
** Note that this function only has to work for cursors configured to
** iterate forwards (not backwards).
*/
static void mergeRangeDeletes(MultiCursor *pCsr, int *piVal, int *piFlags){
int f = *piFlags;
int iKey = pCsr->aTree[1];
int i;
assert( pCsr->flags & CURSOR_NEXT_OK );
if( pCsr->flags & CURSOR_IGNORE_DELETE ){
/* The ignore-delete flag is set when the output of the merge will form
** the oldest level in the database. In this case there is no point in
** retaining any range-delete flags. */
assert( (f & LSM_POINT_DELETE)==0 );
f &= ~(LSM_START_DELETE|LSM_END_DELETE);
}else{
/* Examine the current key of every sub-cursor other than the one (iKey)
** that supplied the current entry. */
for(i=0; i<(CURSOR_DATA_SEGMENT + pCsr->nPtr); i++){
if( i!=iKey ){
int eType;
void *pKey;
int nKey;
int res;
multiCursorGetKey(pCsr, i, &eType, &pKey, &nKey);
if( pKey ){
res = sortedKeyCompare(pCsr->pDb->xCmp,
rtTopic(pCsr->eType), pCsr->key.pData, pCsr->key.nData,
rtTopic(eType), pKey, nKey
);
assert( res<=0 );
if( res==0 ){
/* Sub-cursor i is on the same key. If the current flags carry
** neither an insert nor a point-delete, inherit one from it. Any
** range-delete flags are always merged in. */
if( (f & (LSM_INSERT|LSM_POINT_DELETE))==0 ){
if( eType & LSM_INSERT ){
f |= LSM_INSERT;
*piVal = i;
}
else if( eType & LSM_POINT_DELETE ){
f |= LSM_POINT_DELETE;
}
}
f |= (eType & (LSM_END_DELETE|LSM_START_DELETE));
}
/* A sub-cursor with a larger index (i>iKey) is positioned on a larger
** key that carries LSM_END_DELETE. If the flags hold an insert or
** point-delete, both range-delete flags are added; otherwise the flags
** are cleared. Either way the scan stops here. */
if( i>iKey && (eType & LSM_END_DELETE) && res<0 ){
if( f & (LSM_INSERT|LSM_POINT_DELETE) ){
f |= (LSM_END_DELETE|LSM_START_DELETE);
}else{
f = 0;
}
break;
}
}
}
}
assert( (f & LSM_INSERT)==0 || (f & LSM_POINT_DELETE)==0 );
/* START_DELETE, END_DELETE and POINT_DELETE all set: clear the flags. */
if( (f & LSM_START_DELETE)
&& (f & LSM_END_DELETE)
&& (f & LSM_POINT_DELETE )
){
f = 0;
}
}
*piFlags = f;
}
/*
** Perform one step of the merge being run by merge-worker pMW: process the
** entry that the worker's cursor currently points to and then advance the
** cursor to the next entry.
**
** The entry flags are first adjusted by mergeRangeDeletes(). If the
** resulting flags are non-zero, and the entry is not a separator whose
** pointer is zero, the key and value are written to the output with
** mergeWorkerWrite(). The cursor must point at a valid entry when this
** function is called.
**
** Returns LSM_OK on success or an error code.
*/
static int mergeWorkerStep(MergeWorker *pMW){
lsm_db *pDb = pMW->pDb; /* Database handle */
MultiCursor *pCsr; /* Cursor to read input data from */
int rc = LSM_OK; /* Return code */
int eType; /* SORTED_SEPARATOR, WRITE or DELETE */
void *pKey; int nKey; /* Key */
LsmPgno iPtr; /* Pointer value to write with the record */
int iVal; /* Index of sub-cursor that supplies the value */
pCsr = pMW->pCsr;
/* Pull the next record out of the source cursor. */
lsmMCursorKey(pCsr, &pKey, &nKey);
eType = pCsr->eType;
/* Figure out if the output record may have a different pointer value
** than the previous. This is the case if the current key is identical to
** a key that appears in the lowest level run being merged. If so, set
** iPtr to the absolute pointer value. If not, iPtr keeps the value of
** *pCsr->pPrevMergePtr (or zero if that pointer is not set). */
iPtr = (pCsr->pPrevMergePtr ? *pCsr->pPrevMergePtr : 0);
if( pCsr->pBtCsr ){
BtreeCursor *pBtCsr = pCsr->pBtCsr;
if( pBtCsr->pKey ){
int res = rtTopic(pBtCsr->eType) - rtTopic(eType);
if( res==0 ) res = pDb->xCmp(pBtCsr->pKey, pBtCsr->nKey, pKey, nKey);
if( 0==res ) iPtr = pBtCsr->iPtr;
assert( res>=0 );
}
}else if( pCsr->nPtr ){
SegmentPtr *pPtr = &pCsr->aPtr[pCsr->nPtr-1];
if( pPtr->pPg
&& 0==pDb->xCmp(pPtr->pKey, pPtr->nKey, pKey, nKey)
){
iPtr = pPtr->iPtr+pPtr->iPgPtr;
}
}
iVal = pCsr->aTree[1];
mergeRangeDeletes(pCsr, &iVal, &eType);
if( eType!=0 ){
if( pMW->aGobble ){
/* If the current entry comes from a segment pointer whose page is not
** flagged PGFTR_SKIP_THIS_FLAG, remember that page number in
** aGobble[]. */
int iGobble = pCsr->aTree[1] - CURSOR_DATA_SEGMENT;
if( iGobble<pCsr->nPtr && iGobble>=0 ){
SegmentPtr *pGobble = &pCsr->aPtr[iGobble];
if( (pGobble->flags & PGFTR_SKIP_THIS_FLAG)==0 ){
pMW->aGobble[iGobble] = lsmFsPageNumber(pGobble->pPg);
}
}
}
/* If this is a separator key and we know that the output pointer has not
** changed, there is no point in writing an output record. Otherwise,
** proceed. */
if( rc==LSM_OK && (rtIsSeparator(eType)==0 || iPtr!=0) ){
/* Write the record into the main run. */
void *pVal; int nVal;
rc = multiCursorGetVal(pCsr, iVal, &pVal, &nVal);
if( pVal && rc==LSM_OK ){
/* Take a copy of the value in pCsr->val and write from the copy. */
assert( nVal>=0 );
rc = sortedBlobSet(pDb->pEnv, &pCsr->val, pVal, nVal);
pVal = pCsr->val.pData;
}
if( rc==LSM_OK ){
rc = mergeWorkerWrite(pMW, eType, pKey, nKey, pVal, nVal, iPtr);
}
}
}
/* Advance the cursor to the next input record (assuming one exists). */
assert( lsmMCursorValid(pMW->pCsr) );
if( rc==LSM_OK ) rc = lsmMCursorNext(pMW->pCsr);
return rc;
}
/*
** Return non-zero if merge-worker pMW has no more input to process. This
** is the case if the worker has no cursor at all, or if its cursor no
** longer points at a valid entry. Otherwise return zero.
*/
static int mergeWorkerDone(MergeWorker *pMW){
  if( pMW->pCsr==0 ){
    return 1;
  }
  return !lsmMCursorValid(pMW->pCsr);
}
/*
** Free Level object p along with the split-key, Merge object and
** right-hand-side segment array that it owns. All allocations are released
** through environment pEnv. Passing a NULL pointer is a harmless no-op.
*/
static void sortedFreeLevel(lsm_env *pEnv, Level *p){
  if( p==0 ) return;

  lsmFree(pEnv, p->pSplitKey);
  lsmFree(pEnv, p->pMerge);
  lsmFree(pEnv, p->aRhs);
  lsmFree(pEnv, p);
}
/*
** If a work-hook callback (pDb->xWork) is configured for database handle
** pDb, invoke it, passing the handle and the registered context pointer.
** Otherwise do nothing.
*/
static void sortedInvokeWorkHook(lsm_db *pDb){
  if( pDb->xWork==0 ) return;
  pDb->xWork(pDb, pDb->pWorkCtx);
}
/*
** Create a new top-level in the worker snapshot of database pDb. The new
** level is populated from a multi-cursor that visits the free-list and the
** in-memory tree selected by eTree. If eTree is TREE_NONE the resulting
** level is flagged LEVEL_FREELIST_ONLY.
**
** If the current top level is not being merged then either:
**
**   * it is a free-list-only level, in which case its content is read by
**     the cursor as well and the level is deleted afterwards; or
**   * (eTree!=TREE_NONE only) it has a b-tree root, in which case a b-tree
**     cursor is opened on it and its iRoot is zeroed afterwards.
**
** While the level is being written, pDb->pFreelist points at a local
** Freelist object and pDb->bUseFreelist is set. On success that object
** replaces the worker snapshot's free-list (or the snapshot's free-list is
** emptied if the local object has no entries).
**
** If an error occurs, or the new level turns out to be empty, the new
** Level object is discarded and the previous top level is restored. If
** pnWrite is not NULL, *pnWrite is set to the number of pages written.
*/
static int sortedNewToplevel(
lsm_db *pDb, /* Connection handle */
int eTree, /* One of the TREE_XXX constants */
int *pnWrite /* OUT: Number of database pages written */
){
int rc = LSM_OK; /* Return Code */
MultiCursor *pCsr = 0;
Level *pNext = 0; /* The current top level */
Level *pNew; /* The new level itself */
Segment *pLinked = 0; /* Delete separators from this segment */
Level *pDel = 0; /* Delete this entire level */
int nWrite = 0; /* Number of database pages written */
Freelist freelist; /* Free-list built while the level is written */
if( eTree!=TREE_NONE ){
rc = lsmShmCacheChunks(pDb, pDb->treehdr.nChunk);
}
/* Route free-list changes to the local object for the duration. */
assert( pDb->bUseFreelist==0 );
pDb->pFreelist = &freelist;
pDb->bUseFreelist = 1;
memset(&freelist, 0, sizeof(freelist));
/* Allocate the new level structure to write to. */
pNext = lsmDbSnapshotLevel(pDb->pWorker);
pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);
if( pNew ){
pNew->pNext = pNext;
lsmDbSnapshotSetLevel(pDb->pWorker, pNew);
}
/* Create a cursor to gather the data required by the new segment. The new
** segment contains everything in the tree and pointers to the next segment
** in the database (if any). */
pCsr = multiCursorNew(pDb, &rc);
if( pCsr ){
pCsr->pDb = pDb;
rc = multiCursorVisitFreelist(pCsr);
if( rc==LSM_OK ){
rc = multiCursorAddTree(pCsr, pDb->pWorker, eTree);
}
if( rc==LSM_OK && pNext && pNext->pMerge==0 ){
if( (pNext->flags & LEVEL_FREELIST_ONLY) ){
/* Absorb the existing free-list-only level into the new one. */
pDel = pNext;
pCsr->aPtr = lsmMallocZeroRc(pDb->pEnv, sizeof(SegmentPtr), &rc);
multiCursorAddOne(pCsr, pNext, &rc);
}else if( eTree!=TREE_NONE && pNext->lhs.iRoot ){
pLinked = &pNext->lhs;
rc = btreeCursorNew(pDb, pLinked, &pCsr->pBtCsr);
}
}
/* If this will be the only segment in the database, discard any delete
** markers present in the in-memory tree. */
if( pNext==0 ){
multiCursorIgnoreDelete(pCsr);
}
}
if( rc!=LSM_OK ){
lsmMCursorClose(pCsr, 0);
}else{
LsmPgno iLeftPtr = 0;
Merge merge; /* Merge object used to create new level */
MergeWorker mergeworker; /* MergeWorker object for the same purpose */
memset(&merge, 0, sizeof(Merge));
memset(&mergeworker, 0, sizeof(MergeWorker));
/* The Merge object lives on the stack; pNew->pMerge is reset to 0 below
** before this block is left. */
pNew->pMerge = &merge;
pNew->flags |= LEVEL_INCOMPLETE;
mergeworker.pDb = pDb;
mergeworker.pLevel = pNew;
mergeworker.pCsr = pCsr;
pCsr->pPrevMergePtr = &iLeftPtr;
/* Mark the separators array for the new level as a "phantom". */
mergeworker.bFlush = 1;
/* Do the work to create the new merged segment on disk */
if( rc==LSM_OK ) rc = lsmMCursorFirst(pCsr);
while( rc==LSM_OK && mergeWorkerDone(&mergeworker)==0 ){
rc = mergeWorkerStep(&mergeworker);
}
mergeWorkerShutdown(&mergeworker, &rc);
assert( rc!=LSM_OK || mergeworker.nWork==0 || pNew->lhs.iFirst );
if( rc==LSM_OK && pNew->lhs.iFirst ){
rc = lsmFsSortedFinish(pDb->pFS, &pNew->lhs);
}
nWrite = mergeworker.nWork;
pNew->flags &= ~LEVEL_INCOMPLETE;
if( eTree==TREE_NONE ){
pNew->flags |= LEVEL_FREELIST_ONLY;
}
pNew->pMerge = 0;
}
if( rc!=LSM_OK || pNew->lhs.iFirst==0 ){
/* Error, or nothing was written: restore the previous top level. */
assert( rc!=LSM_OK || pDb->pWorker->freelist.nEntry==0 );
lsmDbSnapshotSetLevel(pDb->pWorker, pNext);
sortedFreeLevel(pDb->pEnv, pNew);
}else{
if( pLinked ){
pLinked->iRoot = 0;
}else if( pDel ){
/* Unlink and delete the free-list-only level that was absorbed. */
assert( pNew->pNext==pDel );
pNew->pNext = pDel->pNext;
lsmFsSortedDelete(pDb->pFS, pDb->pWorker, 1, &pDel->lhs);
sortedFreeLevel(pDb->pEnv, pDel);
}
#if LSM_LOG_STRUCTURE
lsmSortedDumpStructure(pDb, pDb->pWorker, LSM_LOG_DATA, 0, "new-toplevel");
#endif
if( freelist.nEntry ){
/* Hand ownership of the local free-list array to the snapshot. */
Freelist *p = &pDb->pWorker->freelist;
lsmFree(pDb->pEnv, p->aEntry);
memcpy(p, &freelist, sizeof(freelist));
freelist.aEntry = 0;
}else{
pDb->pWorker->freelist.nEntry = 0;
}
assertBtreeOk(pDb, &pNew->lhs);
sortedInvokeWorkHook(pDb);
}
if( pnWrite ) *pnWrite = nWrite;
pDb->pWorker->nWrite += nWrite;
pDb->pFreelist = 0;
pDb->bUseFreelist = 0;
lsmFree(pDb->pEnv, freelist.aEntry);
return rc;
}
/*
** The nMerge levels in the LSM beginning with pLevel consist of a
** left-hand-side segment only. Replace these levels with a single new
** level consisting of a new empty segment on the left-hand-side and the
** nMerge segments from the replaced levels on the right-hand-side.
**
** Also, allocate and populate a Merge object and set Level.pMerge to
** point to it. The Merge object has one input for each merged segment,
** plus one more if the level following the merged levels is to be read
** through its b-tree (it is not being merged, has a b-tree root, and is a
** free-list-only level whenever all the merged levels are).
**
** All memory is allocated before the level list is modified. If an
** allocation fails, the snapshot is left exactly as it was, nothing is
** leaked, *ppNew is set to NULL and an error code is returned. Otherwise
** LSM_OK is returned and *ppNew points to the new level.
*/
static int sortedMergeSetup(
  lsm_db *pDb,                    /* Database handle */
  Level *pLevel,                  /* First level to merge */
  int nMerge,                     /* Merge this many levels together */
  Level **ppNew                   /* New, merged, level */
){
  int rc = LSM_OK;                /* Return Code */
  Level *pNew;                    /* New Level object */
  Level *pNext = 0;               /* Level following the merged levels */
  Level *p = pLevel;              /* Used to iterate through levels */
  int bFreeOnly = 1;              /* True if all merged levels free-list-only */
  int bUseNext = 0;               /* True to link in next separators */
  Merge *pMerge = 0;              /* New Merge object */
  int nByte;                      /* Bytes of space allocated at pMerge */
  int i;                          /* Iterator variable */

#ifdef LSM_DEBUG
  int iLevel;
  Level *pX = pLevel;
  for(iLevel=0; iLevel<nMerge; iLevel++){
    assert( pX->nRight==0 );
    pX = pX->pNext;
  }
#endif

  /* Allocate the new Level object */
  pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);
  if( pNew ){
    pNew->aRhs = (Segment *)lsmMallocZeroRc(pDb->pEnv,
        nMerge * sizeof(Segment), &rc);
  }

  /* Inspect, without modifying, the levels to be merged. This determines
  ** the size of the Merge object, which is then allocated while the level
  ** list is still intact. */
  if( rc==LSM_OK ){
    for(i=0; i<nMerge; i++){
      assert( p->nRight==0 );
      if( (p->flags & LEVEL_FREELIST_ONLY)==0 ) bFreeOnly = 0;
      pNext = p->pNext;
      p = pNext;
    }

    /* Determine whether or not the next separators will be linked in */
    if( pNext && pNext->pMerge==0 && pNext->lhs.iRoot
     && (bFreeOnly==0 || (pNext->flags & LEVEL_FREELIST_ONLY))
    ){
      bUseNext = 1;
    }

    /* Allocate the merge object */
    nByte = sizeof(Merge) + sizeof(MergeInput) * (nMerge + bUseNext);
    pMerge = (Merge *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
  }

  if( rc==LSM_OK ){
    Level *pTopLevel;             /* Head of the snapshot's level list */
    Level **pp;                   /* Pointer to the link that refers to pLevel */
    Level *pOld = pLevel;         /* Old level being absorbed */

    /* Populate the new Level and Merge objects. At this point variable p
    ** points to the level that follows the last merged level (or is NULL). */
    pMerge->aInput = (MergeInput *)&pMerge[1];
    pMerge->nInput = nMerge + bUseNext;
    pNew->pMerge = pMerge;
    pNew->nRight = nMerge;
    pNew->iAge = pLevel->iAge+1;
    if( bFreeOnly ) pNew->flags |= LEVEL_FREELIST_ONLY;
    pNew->pNext = p;

    /* Replace the old levels with the new. This is done before the old
    ** Level objects are freed so that no freed pointer is ever compared. */
    pTopLevel = lsmDbSnapshotLevel(pDb->pWorker);
    for(pp=&pTopLevel; *pp!=pLevel; pp=&((*pp)->pNext));
    *pp = pNew;
    lsmDbSnapshotSetLevel(pDb->pWorker, pTopLevel);

    /* Move each old left-hand-side segment into the new right-hand-side
    ** array, then free the old Level object. */
    for(i=0; i<nMerge; i++){
      Level *pFollow = pOld->pNext;
      pNew->aRhs[i] = pOld->lhs;
      sortedFreeLevel(pDb->pEnv, pOld);
      pOld = pFollow;
    }
  }else{
    /* An allocation failed. Nothing has been linked into the snapshot, so
    ** simply discard whatever was allocated. sortedFreeLevel() accepts
    ** NULL and also frees pNew->aRhs. */
    sortedFreeLevel(pDb->pEnv, pNew);
    pNew = 0;
  }

  *ppNew = pNew;
  return rc;
}
/*
** Initialize merge-worker object pMW so that it can be used to do work on
** the merge of level pLevel, which must already have a Merge object and at
** least one right-hand-side segment.
**
** A multi-cursor that reads the right-hand-side segments of pLevel is
** created. If the Merge object has more inputs than the level has
** right-hand-side segments, a b-tree cursor is also opened on the
** left-hand-side segment of the following level. The b-tree hierarchy of
** the output is then loaded and the cursor is positioned: at the start of
** the input if the output is still empty, or according to the positions
** saved in Merge.aInput[] otherwise.
**
** Returns LSM_OK on success or an error code. On error pMW->pCsr may be
** NULL or only partially set up.
*/
static int mergeWorkerInit(
  lsm_db *pDb,                    /* Db connection to do merge work */
  Level *pLevel,                  /* Level to work on merging */
  MergeWorker *pMW                /* Object to initialize */
){
  int rc = LSM_OK;                /* Return code */
  Merge *pMerge = pLevel->pMerge; /* Persistent part of merge state */
  MultiCursor *pCsr = 0;          /* Cursor opened for pMW */
  Level *pNext = pLevel->pNext;   /* Next level in LSM */

  assert( pDb->pWorker );
  assert( pLevel->pMerge );
  assert( pLevel->nRight>0 );

  memset(pMW, 0, sizeof(MergeWorker));
  pMW->pDb = pDb;
  pMW->pLevel = pLevel;
  pMW->aGobble = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmPgno)*pLevel->nRight,&rc);

  /* Create a multi-cursor to read the data to write to the new
  ** segment. The new segment contains:
  **
  **   1. Records from LHS of each of the nMerge levels being merged.
  **   2. Separators from either the last level being merged, or the
  **      separators attached to the LHS of the following level, or neither.
  **
  ** If the new level is the lowest (oldest) in the db, discard any
  ** delete keys. Key annihilation.
  */
  pCsr = multiCursorNew(pDb, &rc);
  if( pCsr ){
    pCsr->flags |= CURSOR_NEXT_OK;
    rc = multiCursorAddRhs(pCsr, pLevel);
  }
  if( rc==LSM_OK && pMerge->nInput > pLevel->nRight ){
    rc = btreeCursorNew(pDb, &pNext->lhs, &pCsr->pBtCsr);
  }else if( pCsr ){
    /* Only touch the cursor if it was actually allocated. If
    ** multiCursorNew() failed, pCsr is NULL here and must not be handed to
    ** the helpers below. */
    if( pNext ){
      multiCursorReadSeparators(pCsr);
    }else{
      multiCursorIgnoreDelete(pCsr);
    }
  }

  assert( rc!=LSM_OK || pMerge->nInput==(pCsr->nPtr+(pCsr->pBtCsr!=0)) );
  pMW->pCsr = pCsr;

  /* Load the b-tree hierarchy into memory. */
  if( rc==LSM_OK ) rc = mergeWorkerLoadHierarchy(pMW);
  if( rc==LSM_OK && pMW->hier.nHier==0 ){
    pMW->aSave[0].iPgno = pLevel->lhs.iFirst;
  }

  /* Position the cursor. */
  if( rc==LSM_OK ){
    pCsr->pPrevMergePtr = &pMerge->iCurrentPtr;
    if( pLevel->lhs.iFirst==0 ){
      /* The output array is still empty. So position the cursor at the very
      ** start of the input.  */
      rc = multiCursorEnd(pCsr, 0);
    }else{
      /* The output array is non-empty. Position the cursor based on the
      ** page/cell data saved in the Merge.aInput[] array.  */
      int i;
      for(i=0; rc==LSM_OK && i<pCsr->nPtr; i++){
        MergeInput *pInput = &pMerge->aInput[i];
        if( pInput->iPg ){
          SegmentPtr *pPtr;
          assert( pCsr->aPtr[i].pPg==0 );
          pPtr = &pCsr->aPtr[i];
          rc = segmentPtrLoadPage(pDb->pFS, pPtr, pInput->iPg);
          if( rc==LSM_OK && pPtr->nCell>0 ){
            rc = segmentPtrLoadCell(pPtr, pInput->iCell);
          }
        }
      }

      /* The saved b-tree cursor position follows the segment inputs. */
      if( rc==LSM_OK && pCsr->pBtCsr ){
        int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
        assert( i==pCsr->nPtr );
        rc = btreeCursorRestore(pCsr->pBtCsr, xCmp, &pMerge->aInput[i]);
      }

      if( rc==LSM_OK ){
        rc = multiCursorSetupTree(pCsr, 0);
      }
    }
    pCsr->flags |= CURSOR_NEXT_OK;
  }

  return rc;
}
/*
** This function does nothing unless the key the multi-cursor currently
** points to belongs to topic 0. In that case the b-tree of the segment
** belonging to pCsr->aPtr[iGobble] is searched for the cursor's current
** key, the page numbers recorded by the search are collected in a
** zero-terminated array, and that array is handed to lsmFsGobble().
**
** Returns LSM_OK if successful (or if there was nothing to do), or an
** error code if the allocation or the b-tree seek fails.
*/
static int sortedBtreeGobble(
  lsm_db *pDb,                    /* Worker connection */
  MultiCursor *pCsr,              /* Multi-cursor being used for a merge */
  int iGobble                     /* pCsr->aPtr[] entry to operate on */
){
  int rc = LSM_OK;
  Segment *pSeg;                  /* Segment to gobble pages from */
  LsmPgno *aPg;                   /* Page numbers visited by the seek */
  int nPg = 0;                    /* Number of non-zero entries in aPg[] */

  if( rtTopic(pCsr->eType)!=0 ){
    return LSM_OK;
  }
  pSeg = pCsr->aPtr[iGobble].pSeg;

  /* Seek from the root of the b-tree to the segment leaf that may contain
  ** a key equal to the one multi-cursor currently points to. Record the
  ** page number of each b-tree page and the leaf. The segment may be
  ** gobbled up to (but not including) the first of these page numbers.
  */
  assert( pSeg->iRoot>0 );
  aPg = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmPgno)*32, &rc);
  if( rc==LSM_OK ){
    rc = seekInBtree(pCsr, pSeg,
        rtTopic(pCsr->eType), pCsr->key.pData, pCsr->key.nData, aPg, 0
    );
  }

  if( rc==LSM_OK ){
    while( aPg[nPg] ){
      nPg++;
    }
    lsmFsGobble(pDb, pSeg, aPg, nPg);
  }

  lsmFree(pDb->pEnv, aPg);
  return rc;
}
/*
** Argument p points to a level of age N. Count the levels at the head of
** the linked list starting at p that share that age, stopping at the first
** level with a different age or at the end of the list. Level p itself is
** always counted, so the result is at least 1.
*/
static int sortedCountLevels(Level *p){
  int iAge = p->iAge;             /* Age shared by the run of levels */
  int nRet = 1;                   /* Level p itself */
  Level *pIter;                   /* Iterator over the following levels */

  for(pIter=p->pNext; pIter && pIter->iAge==iAge; pIter=pIter->pNext){
    nRet++;
  }
  return nRet;
}
/*
** Select a level of the worker snapshot to do merge work on.
**
** Two kinds of candidate are considered while walking the level list:
**
**   * a run of consecutive levels that have no right-hand-side segments and
**     share the same age, if the run is longer than the current best; and
**   * a level that is already being merged (nRight>0), if it has more
**     right-hand-side segments than the current best.
**
** The initial "best" size is MAX(1, nMerge-1), so a candidate must be
** larger than that to be selected. If nothing qualifies and nMerge==1, all
** levels are selected for merging provided that more than one of them is
** not a free-list-only level.
**
** If the selection is a run of levels, sortedMergeSetup() is called to
** combine them into a single new level, which is written to *ppOut along
** with the return code of that call. If it is a level already being
** merged, that level is written to *ppOut. If nothing is selected, *ppOut
** is left untouched and LSM_OK is returned.
*/
static int sortedSelectLevel(lsm_db *pDb, int nMerge, Level **ppOut){
Level *pTopLevel = lsmDbSnapshotLevel(pDb->pWorker);
int rc = LSM_OK;
Level *pLevel = 0; /* Output value */
Level *pBest = 0; /* Best level to work on found so far */
int nBest; /* Number of segments merged at pBest */
Level *pThis = 0; /* First in run of levels with age=iAge */
int nThis = 0; /* Number of levels starting at pThis */
assert( nMerge>=1 );
nBest = LSM_MAX(1, nMerge-1);
/* Find the longest contiguous run of levels not currently undergoing a
** merge with the same age in the structure. Or the level being merged
** with the largest number of right-hand segments. Work on it. */
for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){
if( pLevel->nRight==0 && pThis && pLevel->iAge==pThis->iAge ){
nThis++;
}else{
/* pLevel terminates the current run (if any). The run is accepted as the
** new best unless pLevel is exactly one age older and is either being
** merged or heads a run of more than pDb->nMerge levels. pThis cannot be
** NULL here because nThis>nBest implies nThis>=2. */
if( nThis>nBest ){
if( (pLevel->iAge!=pThis->iAge+1)
|| (pLevel->nRight==0 && sortedCountLevels(pLevel)<=pDb->nMerge)
){
pBest = pThis;
nBest = nThis;
}
}
if( pLevel->nRight ){
/* A level already undergoing a merge. */
if( pLevel->nRight>nBest ){
nBest = pLevel->nRight;
pBest = pLevel;
}
nThis = 0;
pThis = 0;
}else{
/* Start a new run at this level. */
pThis = pLevel;
nThis = 1;
}
}
}
/* Consider the run that extends to the end of the list. */
if( nThis>nBest ){
assert( pThis );
pBest = pThis;
nBest = nThis;
}
/* Nothing found so far. If nMerge==1 and there is more than one level
** containing something other than free-list entries, merge all levels. */
if( pBest==0 && nMerge==1 ){
int nFree = 0;
int nUsr = 0;
for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){
assert( !pLevel->nRight );
if( pLevel->flags & LEVEL_FREELIST_ONLY ){
nFree++;
}else{
nUsr++;
}
}
if( nUsr>1 ){
pBest = pTopLevel;
nBest = nFree + nUsr;
}
}
if( pBest ){
if( pBest->nRight==0 ){
rc = sortedMergeSetup(pDb, pBest, nBest, ppOut);
}else{
*ppOut = pBest;
}
}
return rc;
}
/*
** Return 1 if the database is considered full, or 0 otherwise. It is full
** if lsmDatabaseFull() says so, or if the top level of the worker snapshot
** has age 0 and is either already being merged (has right-hand-side
** segments) or heads a run of at least pDb->nMerge levels of the same age.
*/
static int sortedDbIsFull(lsm_db *pDb){
  Level *pTop = lsmDbSnapshotLevel(pDb->pWorker);

  if( lsmDatabaseFull(pDb) ) return 1;
  if( pTop==0 || pTop->iAge!=0 ) return 0;
  if( pTop->nRight ) return 1;
  return sortedCountLevels(pTop)>=pDb->nMerge;
}
/*
** Context object passed to moveBlockCb() while sortedMoveBlock() walks the
** free-list to locate the last block of content in the database file.
*/
typedef struct MoveBlockCtx MoveBlockCtx;
struct MoveBlockCtx {
int iSeen; /* Lowest free block seen so far (initially nBlock+1) */
int iFrom; /* OUT: Block to move, or 0 if none has been found */
};
/*
** Free-list walk callback used by sortedMoveBlock(). pCtx points to a
** MoveBlockCtx object. As long as each block visited is the one immediately
** before the previously seen block, the walk continues (0 is returned).
** When a gap is found, the block immediately before the previously seen
** block is recorded in MoveBlockCtx.iFrom and 1 is returned. Parameter
** iSnapshot is not used.
*/
static int moveBlockCb(void *pCtx, int iBlk, i64 iSnapshot){
  MoveBlockCtx *pMove = (MoveBlockCtx *)pCtx;
  int iExpect;                    /* Block expected if the run continues */

  assert( pMove->iFrom==0 );
  iExpect = pMove->iSeen - 1;
  if( iBlk!=iExpect ){
    /* Block iExpect is not on the free-list. It is the one to move. */
    pMove->iFrom = iExpect;
    return 1;
  }

  pMove->iSeen = iBlk;
  return 0;
}
/*
** This function is called to further compact a database for which all
** of the content has already been merged into a single segment. If
** possible, it moves the contents of a single block from the end of the
** file to a free-block that lies closer to the start of the file (allowing
** the file to be eventually truncated).
**
** *pnWrite is set to the number of pages in a block if a block was moved
** and its redirect recorded, or to zero otherwise. No block is moved if the
** redirect array is already full, if no block to move can be identified,
** or if no destination block could be allocated. LSM_OK is returned in
** those cases unless a called function reported an error.
*/
static int sortedMoveBlock(lsm_db *pDb, int *pnWrite){
  Snapshot *p = pDb->pWorker;
  Level *pLvl = lsmDbSnapshotLevel(p);
  MoveBlockCtx sCtx;              /* Context for the free-list walk */
  int iSrc;                       /* Block to move */
  int iDest;                      /* Destination to move block to */
  int rc;                         /* Return code */

  assert( pLvl->pNext==0 && pLvl->nRight==0 );
  assert( p->redirect.n<=LSM_MAX_BLOCK_REDIRECTS );

  *pnWrite = 0;

  /* Check that the redirect array is not already full. If it is, return
  ** without moving any database content. */
  if( p->redirect.n>=LSM_MAX_BLOCK_REDIRECTS ){
    return LSM_OK;
  }

  /* Find the last block of content in the database file. Do this by
  ** traversing the free-list in reverse (descending block number) order.
  ** The first block not on the free list is the one that will be moved.
  ** Since the db consists of a single segment, there is no ambiguity as
  ** to which segment the block belongs to. */
  sCtx.iFrom = 0;
  sCtx.iSeen = p->nBlock+1;
  rc = lsmWalkFreelist(pDb, 1, moveBlockCb, &sCtx);
  if( rc!=LSM_OK || sCtx.iFrom==0 ){
    return rc;
  }
  iSrc = sCtx.iFrom;

  /* Find the first free block in the database, ignoring block 1. Block
  ** 1 is tricky as it is smaller than the other blocks. */
  rc = lsmBlockAllocate(pDb, iSrc, &iDest);
  if( rc!=LSM_OK || iDest==0 ){
    return rc;
  }
  assert( iDest!=1 && iDest<iSrc );

  rc = lsmFsMoveBlock(pDb->pFS, &pLvl->lhs, iDest, iSrc);

  /* Allocate the redirect array the first time it is required. */
  if( rc==LSM_OK && p->redirect.a==0 ){
    int nByte = sizeof(struct RedirectEntry) * LSM_MAX_BLOCK_REDIRECTS;
    p->redirect.a = lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
  }

  if( rc==LSM_OK ){
    /* Check if the block just moved was already redirected. */
    int i = 0;
    while( i<p->redirect.n && p->redirect.a[i].iTo!=iSrc ){
      i++;
    }

    if( i<p->redirect.n ){
      /* Block iSrc was already redirected. Overwrite existing entry. */
      p->redirect.a[i].iTo = iDest;
    }else{
      /* Block iSrc was not already redirected. Add a new array entry at
      ** the start of the array. */
      memmove(&p->redirect.a[1], &p->redirect.a[0],
          sizeof(struct RedirectEntry) * p->redirect.n
      );
      p->redirect.a[0].iFrom = iSrc;
      p->redirect.a[0].iTo = iDest;
      p->redirect.n++;
    }

    rc = lsmBlockFree(pDb, iSrc);

    *pnWrite = lsmFsBlockSize(pDb->pFS) / lsmFsPageSize(pDb->pFS);
    pLvl->lhs.pRedirect = &p->redirect;
  }

#if LSM_LOG_STRUCTURE
  if( rc==LSM_OK ){
    char aBuf[64];
    sprintf(aBuf, "move-block %d/%d", p->redirect.n-1, LSM_MAX_BLOCK_REDIRECTS);
    lsmSortedDumpStructure(pDb, pDb->pWorker, LSM_LOG_DATA, 0, aBuf);
  }
#endif
  return rc;
}
/*
** Absorb the first nFree levels of the worker snapshot into the merge being
** performed by merge-worker pMW. If nFree is zero or less this is a no-op.
**
** The left-hand-side segment of each of those levels is inserted at the
** start of the right-hand-side array of the level being merged
** (pMW->pLevel), and a matching SegmentPtr, positioned with
** segmentPtrEnd(), is inserted at the start of the cursor's aPtr[] array.
** The absorbed Level objects are removed from the snapshot's level list
** and freed.
**
** The cursor's aTree[] array is freed and set to NULL; the caller in this
** file rebuilds it with multiCursorSetupTree() afterwards. The worker's
** aGobble[] array is freed as well.
**
** Returns LSM_OK on success, or an error code. If an allocation fails the
** function returns immediately, possibly leaving the cursor partly
** updated.
*/
static int mergeInsertFreelistSegments(
lsm_db *pDb,
int nFree,
MergeWorker *pMW
){
int rc = LSM_OK;
if( nFree>0 ){
MultiCursor *pCsr = pMW->pCsr;
Level *pLvl = pMW->pLevel;
SegmentPtr *aNew1; /* Replacement for pCsr->aPtr[] */
Segment *aNew2; /* Replacement for pLvl->aRhs[] */
Level *pIter;
Level *pNext;
int i = 0;
/* Grow the cursor's segment-pointer array, leaving nFree zeroed slots at
** the front for the new segments. */
aNew1 = (SegmentPtr *)lsmMallocZeroRc(
pDb->pEnv, sizeof(SegmentPtr) * (pCsr->nPtr+nFree), &rc
);
if( rc ) return rc;
memcpy(&aNew1[nFree], pCsr->aPtr, sizeof(SegmentPtr)*pCsr->nPtr);
pCsr->nPtr += nFree;
lsmFree(pDb->pEnv, pCsr->aTree);
lsmFree(pDb->pEnv, pCsr->aPtr);
pCsr->aTree = 0;
pCsr->aPtr = aNew1;
/* Grow the level's right-hand-side segment array in the same way. */
aNew2 = (Segment *)lsmMallocZeroRc(
pDb->pEnv, sizeof(Segment) * (pLvl->nRight+nFree), &rc
);
if( rc ) return rc;
memcpy(&aNew2[nFree], pLvl->aRhs, sizeof(Segment)*pLvl->nRight);
pLvl->nRight += nFree;
lsmFree(pDb->pEnv, pLvl->aRhs);
pLvl->aRhs = aNew2;
/* Move the lhs segment of each level that precedes pLvl into the new
** slots, unlinking and freeing each level as it is consumed. */
for(pIter=pDb->pWorker->pLevel; rc==LSM_OK && pIter!=pLvl; pIter=pNext){
Segment *pSeg = &pLvl->aRhs[i];
memcpy(pSeg, &pIter->lhs, sizeof(Segment));
pCsr->aPtr[i].pSeg = pSeg;
pCsr->aPtr[i].pLevel = pLvl;
rc = segmentPtrEnd(pCsr, &pCsr->aPtr[i], 0);
pDb->pWorker->pLevel = pNext = pIter->pNext;
sortedFreeLevel(pDb->pEnv, pIter);
i++;
}
assert( i==nFree );
assert( rc!=LSM_OK || pDb->pWorker->pLevel==pLvl );
/* The pre-existing SegmentPtr entries still refer to the old aRhs[]
** array, which has been freed. Point them at the new array. */
for(i=nFree; i<pCsr->nPtr; i++){
pCsr->aPtr[i].pSeg = &pLvl->aRhs[i];
}
lsmFree(pDb->pEnv, pMW->aGobble);
pMW->aGobble = 0;
}
return rc;
}
/*
** Perform merging work on the worker snapshot of database handle pDb.
**
** The main loop runs while the work budget (nRemaining, initially nWork)
** is positive. Each iteration asks sortedSelectLevel() for a level to
** merge. If none is returned, a block move may be attempted instead (see
** below) and the loop ends if that did no work. Otherwise a MergeWorker is
** set up for the level and stepped until the merge is done, the budget is
** used up, or an error occurs.
**
** If bFlush is non-zero, the loop also ends as soon as sortedDbIsFull()
** returns zero.
**
** If pnWrite is not NULL, *pnWrite is set to (nWork - nRemaining) before
** returning from the bottom of the function; the same value is added to
** pWorker->nWrite. Note that the early return taken when the snapshot has
** no levels does not write *pnWrite.
**
** Returns LSM_OK or the error code of the first failing call.
*/
static int sortedWork(
lsm_db *pDb, /* Database handle. Must be worker. */
int nWork, /* Number of pages of work to do */
int nMerge, /* Try to merge this many levels at once */
int bFlush, /* Set if call is to make room for a flush */
int *pnWrite /* OUT: Actual number of pages written */
){
int rc = LSM_OK; /* Return Code */
int nRemaining = nWork; /* Units of work to do before returning */
Snapshot *pWorker = pDb->pWorker;
assert( pWorker );
/* A snapshot with no levels has nothing to merge. */
if( lsmDbSnapshotLevel(pWorker)==0 ) return LSM_OK;
while( nRemaining>0 ){
Level *pLevel = 0;
/* Find a level to work on. */
rc = sortedSelectLevel(pDb, nMerge, &pLevel);
assert( rc==LSM_OK || pLevel==0 );
if( pLevel==0 ){
int nDone = 0;
Level *pTopLevel = lsmDbSnapshotLevel(pDb->pWorker);
/* No level was selected. If this is not a flush call, nMerge is 1 and
** the snapshot consists of exactly one level, call sortedMoveBlock()
** instead. nDone receives the amount of work it reports. */
if( bFlush==0 && nMerge==1 && pTopLevel && pTopLevel->pNext==0 ){
rc = sortedMoveBlock(pDb, &nDone);
}
nRemaining -= nDone;
/* Could not find any work to do. Finished. */
if( nDone==0 ) break;
}else{
int bSave = 0;
Freelist freelist = {0, 0, 0};
MergeWorker mergeworker; /* State used to work on the level merge */
assert( pDb->bIncrMerge==0 );
assert( pDb->pFreelist==0 && pDb->bUseFreelist==0 );
pDb->bIncrMerge = 1;
rc = mergeWorkerInit(pDb, pLevel, &mergeworker);
assert( mergeworker.nWork==0 );
/* Step the merge while under budget. Once pDb->bUseFreelist has been
** set (below), the budget test is bypassed and stepping continues until
** the merge is done or an error occurs. */
while( rc==LSM_OK
&& 0==mergeWorkerDone(&mergeworker)
&& (mergeworker.nWork<nRemaining || pDb->bUseFreelist)
){
/* Topic of the current entry, sampled before the step so that the
** transition out of user data can be detected afterwards. */
int eType = rtTopic(mergeworker.pCsr->eType);
rc = mergeWorkerStep(&mergeworker);
/* If the cursor now points at the first entry past the end of the
** user data (i.e. either to EOF or to the first free-list entry
** that will be added to the run), then check if it is possible to
** merge in any free-list entries that are either in-memory or in
** free-list-only blocks. */
if( rc==LSM_OK && nMerge==1 && eType==0
&& (rtTopic(mergeworker.pCsr->eType) || mergeWorkerDone(&mergeworker))
){
int nFree = 0; /* Number of free-list-only levels to merge */
Level *pLvl;
assert( pDb->pFreelist==0 && pDb->bUseFreelist==0 );
/* Now check if all levels containing data newer than this one
** are single-segment free-list only levels. If so, they will be
** merged in now. */
for(pLvl=pDb->pWorker->pLevel;
pLvl!=mergeworker.pLevel && (pLvl->flags & LEVEL_FREELIST_ONLY);
pLvl=pLvl->pNext
){
assert( pLvl->nRight==0 );
nFree++;
}
if( pLvl==mergeworker.pLevel ){
rc = mergeInsertFreelistSegments(pDb, nFree, &mergeworker);
if( rc==LSM_OK ){
rc = multiCursorVisitFreelist(mergeworker.pCsr);
}
if( rc==LSM_OK ){
rc = multiCursorSetupTree(mergeworker.pCsr, 0);
/* From here on pDb->pFreelist points at the local 'freelist'
** object declared above. */
pDb->pFreelist = &freelist;
pDb->bUseFreelist = 1;
}
}
}
}
/* Charge at least one unit of work per pass through the outer loop. */
nRemaining -= LSM_MAX(mergeworker.nWork, 1);
if( rc==LSM_OK ){
/* Check if the merge operation is completely finished. If not,
** gobble up (declare eligible for recycling) any pages from rhs
** segments for which the content has been completely merged into
** the lhs of the level. */
if( mergeWorkerDone(&mergeworker)==0 ){
int i;
for(i=0; i<pLevel->nRight; i++){
SegmentPtr *pGobble = &mergeworker.pCsr->aPtr[i];
if( pGobble->pSeg->iRoot ){
rc = sortedBtreeGobble(pDb, mergeworker.pCsr, i);
}else if( mergeworker.aGobble[i] ){
lsmFsGobble(pDb, pGobble->pSeg, &mergeworker.aGobble[i], 1);
}
}
}else{
int i;
int bEmpty;
mergeWorkerShutdown(&mergeworker, &rc);
bEmpty = (pLevel->lhs.iFirst==0);
if( bEmpty==0 && rc==LSM_OK ){
rc = lsmFsSortedFinish(pDb->pFS, &pLevel->lhs);
}
if( pDb->bUseFreelist ){
/* Replace the worker snapshot's free-list with the local one built
** up during this merge, then stop redirecting free-list entries. */
Freelist *p = &pDb->pWorker->freelist;
lsmFree(pDb->pEnv, p->aEntry);
memcpy(p, &freelist, sizeof(freelist));
pDb->bUseFreelist = 0;
pDb->pFreelist = 0;
bSave = 1;
}
for(i=0; i<pLevel->nRight; i++){
lsmFsSortedDelete(pDb->pFS, pWorker, 1, &pLevel->aRhs[i]);
}
if( bEmpty ){
/* If the new level is completely empty, remove it from the
** database snapshot. This can only happen if all input keys were
** annihilated. Since keys are only annihilated if the new level
** is the last in the linked list (contains the most ancient of
** database content), this guarantees that pLevel->pNext==0. */
Level *pTop; /* Top level of worker snapshot */
Level **pp; /* Read/write iterator for Level.pNext list */
assert( pLevel->pNext==0 );
/* Remove the level from the worker snapshot. */
pTop = lsmDbSnapshotLevel(pWorker);
for(pp=&pTop; *pp!=pLevel; pp=&((*pp)->pNext));
*pp = pLevel->pNext;
lsmDbSnapshotSetLevel(pWorker, pTop);
/* Free the Level structure. */
sortedFreeLevel(pDb->pEnv, pLevel);
}else{
/* Free the separators of the next level, if required. */
if( pLevel->pMerge->nInput > pLevel->nRight ){
assert( pLevel->pNext->lhs.iRoot );
pLevel->pNext->lhs.iRoot = 0;
}
/* Zero the right-hand-side of pLevel */
lsmFree(pDb->pEnv, pLevel->aRhs);
pLevel->nRight = 0;
pLevel->aRhs = 0;
/* Free the Merge object */
lsmFree(pDb->pEnv, pLevel->pMerge);
pLevel->pMerge = 0;
}
/* bSave is only set when the free-list was replaced above. The
** bIncrMerge flag is cleared before lsmSaveWorker() is called. */
if( bSave && rc==LSM_OK ){
pDb->bIncrMerge = 0;
rc = lsmSaveWorker(pDb, 0);
}
}
}
/* Clean up the MergeWorker object initialized above. If no error
** has occurred, invoke the work-hook to inform the application that
** the database structure has changed. */
mergeWorkerShutdown(&mergeworker, &rc);
pDb->bIncrMerge = 0;
if( rc==LSM_OK ) sortedInvokeWorkHook(pDb);
#if LSM_LOG_STRUCTURE
lsmSortedDumpStructure(pDb, pDb->pWorker, LSM_LOG_DATA, 0, "work");
#endif
/* NOTE(review): in the bEmpty branch above pLevel has already been
** passed to sortedFreeLevel(). If these two checks expand to real
** function calls (LSM_DEBUG_EXPENSIVE builds), they would then be
** reading a freed Level - confirm. */
assertBtreeOk(pDb, &pLevel->lhs);
assertRunInOrder(pDb, &pLevel->lhs);
/* If bFlush is true and the database is no longer considered "full",
** break out of the loop even if nRemaining is still greater than
** zero. The caller has an in-memory tree to flush to disk. */
if( bFlush && sortedDbIsFull(pDb)==0 ) break;
}
}
/* Report the work accounted for. nRemaining may have gone negative, in
** which case the reported value exceeds nWork. */
if( pnWrite ) *pnWrite = (nWork - nRemaining);
pWorker->nWrite += (nWork - nRemaining);
#ifdef LSM_LOG_WORK
lsmLogMessage(pDb, rc, "sortedWork(): %d pages", (nWork-nRemaining));
#endif
return rc;
}
/*
** The database connection passed as the first argument must be a worker
** connection (pDb->pWorker must be set). Return true if the tree header
** records an "old" in-memory tree (treehdr.iOldShmid is non-zero) whose
** log offset (treehdr.iOldLog) differs from the worker snapshot's iLogOff.
** Return false otherwise.
**
** If *pRc is not LSM_OK when this function is called, it is a no-op that
** returns false. This function never changes the value of *pRc.
*/
static int sortedTreeHasOld(lsm_db *pDb, int *pRc){
  int bHasOld = 0;

  assert( pDb->pWorker );
  if( *pRc==LSM_OK ){
    bHasOld = ( pDb->treehdr.iOldShmid
             && pDb->treehdr.iOldLog!=pDb->pWorker->iLogOff );
  }

  assert( *pRc==LSM_OK || bHasOld==0 );
  return bHasOld;
}
/*
** Write a new top-level segment that contains free-list entries only, by
** calling sortedNewToplevel() with TREE_NONE. The return value is whatever
** sortedNewToplevel() returns: LSM_OK on success, or an LSM error code.
*/
static int sortedNewFreelistOnly(lsm_db *pDb){
  int rc;
  rc = sortedNewToplevel(pDb, TREE_NONE, 0);
  return rc;
}
/*
** Save the worker snapshot of pDb via lsmCheckpointSaveWorker(), passing
** bFlush through unchanged.
**
** If the snapshot's in-memory free-list holds more than pDb->nMaxFreelist
** entries, a free-list-only top-level segment is written first. If that
** fails, its error code is returned and the snapshot is not saved.
*/
int lsmSaveWorker(lsm_db *pDb, int bFlush){
  Snapshot *pSnap = pDb->pWorker;
  int rc = LSM_OK;

  if( pSnap->freelist.nEntry>pDb->nMaxFreelist ){
    rc = sortedNewFreelistOnly(pDb);
  }
  if( rc==LSM_OK ){
    rc = lsmCheckpointSaveWorker(pDb, bFlush);
  }
  return rc;
}
/*
** Open a worker "transaction" with lsmBeginWork(), do up to nPage pages of
** work, and close it again with lsmFinishWork() before returning.
**
** The work done is, in order:
**
**   1. If sortedTreeHasOld() reports an old in-memory tree, merge existing
**      segments while sortedDbIsFull() is true and then write the old tree
**      out as a new top-level (sortedNewToplevel(TREE_OLD)).
**   2. Unless bShutdown is set, general merging via sortedWork().
**   3. If the worker snapshot's free-list has more than pDb->nMaxFreelist
**      entries, write a free-list-only top-level.
**
** When bShutdown is zero and pDb->nAutockpt is set, the page budget is
** reduced so that this call stops when an auto-checkpoint falls due. In
** that case *pbCkpt is set to tell the caller to checkpoint.
**
** On success *pnWrite is set to the number of budgeted pages consumed. It
** is forced to zero if an error occurred or if nothing was marked dirty.
** Returns LSM_OK or an LSM error code.
*/
static int doLsmSingleWork(
lsm_db *pDb,
int bShutdown,
int nMerge, /* Minimum segments to merge together */
int nPage, /* Number of pages to write to disk */
int *pnWrite, /* OUT: Pages actually written to disk */
int *pbCkpt /* OUT: True if an auto-checkpoint is req. */
){
Snapshot *pWorker; /* Worker snapshot */
int rc = LSM_OK; /* Return code */
int bDirty = 0;
int nMax = nPage; /* Maximum pages to write to disk */
int nRem = nPage;
int bCkpt = 0;
assert( nPage>0 );
/* Open the worker 'transaction'. It will be closed before this function
** returns. */
assert( pDb->pWorker==0 );
rc = lsmBeginWork(pDb);
if( rc!=LSM_OK ) return rc;
pWorker = pDb->pWorker;
/* If this connection is doing auto-checkpoints, set nMax (and nRem) so
** that this call stops writing when the auto-checkpoint is due. The
** caller will do the checkpoint, then possibly call this function again. */
if( bShutdown==0 && pDb->nAutockpt ){
u32 nSync;
u32 nUnsync;
int nPgsz;
lsmCheckpointSynced(pDb, 0, 0, &nSync);
nUnsync = lsmCheckpointNWrite(pDb->pShmhdr->aSnap1, 0);
nPgsz = lsmCheckpointPgsz(pDb->pShmhdr->aSnap1);
/* Pages that may still be written before the checkpoint is due. This
** can be zero or negative, in which case nRem is clamped to 0 below. */
nMax = (int)LSM_MIN(nMax, (pDb->nAutockpt/nPgsz) - (int)(nUnsync-nSync));
if( nMax<nRem ){
bCkpt = 1;
nRem = LSM_MAX(nMax, 0);
}
}
/* If there exists in-memory data ready to be flushed to disk, attempt
** to flush it now. */
if( pDb->nTransOpen==0 ){
rc = lsmTreeLoadHeader(pDb, 0);
}
if( sortedTreeHasOld(pDb, &rc) ){
/* sortedDbIsFull() returns non-zero if either (a) there are too many
** levels in total in the db, or (b) there are too many levels with the
** the same age in the db. Either way, call sortedWork() to merge
** existing segments together until this condition is cleared. */
if( sortedDbIsFull(pDb) ){
int nPg = 0;
rc = sortedWork(pDb, nRem, nMerge, 1, &nPg);
nRem -= nPg;
assert( rc!=LSM_OK || nRem<=0 || !sortedDbIsFull(pDb) );
bDirty = 1;
}
if( rc==LSM_OK && nRem>0 ){
int nPg = 0;
rc = sortedNewToplevel(pDb, TREE_OLD, &nPg);
nRem -= nPg;
if( rc==LSM_OK ){
if( pDb->nTransOpen>0 ){
lsmTreeDiscardOld(pDb);
}
/* The snapshot is saved here, so the dirty flag is cleared. */
rc = lsmSaveWorker(pDb, 1);
bDirty = 0;
}
}
}
/* If nPage is still greater than zero, do some merging. */
if( rc==LSM_OK && nRem>0 && bShutdown==0 ){
int nPg = 0;
rc = sortedWork(pDb, nRem, nMerge, 0, &nPg);
nRem -= nPg;
if( nPg ) bDirty = 1;
}
/* If the in-memory part of the free-list is too large, write a new
** top-level containing just the in-memory free-list entries to disk. */
if( rc==LSM_OK && pDb->pWorker->freelist.nEntry > pDb->nMaxFreelist ){
/* Merge in fixed steps of 16 until lsmDatabaseFull() is cleared. This
** work is charged against nRem even if nRem is already <= 0. */
while( rc==LSM_OK && lsmDatabaseFull(pDb) ){
int nPg = 0;
rc = sortedWork(pDb, 16, nMerge, 1, &nPg);
nRem -= nPg;
}
if( rc==LSM_OK ){
rc = sortedNewFreelistOnly(pDb);
}
bDirty = 1;
}
if( rc==LSM_OK ){
*pnWrite = (nMax - nRem);
*pbCkpt = (bCkpt && nRem<=0);
/* Also request a checkpoint if something was written with nMerge==1
** and auto-checkpoints enabled, and the snapshot now consists of a
** single level with no right-hand-side segments. */
if( nMerge==1 && pDb->nAutockpt>0 && *pnWrite>0
&& pWorker->pLevel
&& pWorker->pLevel->nRight==0
&& pWorker->pLevel->pNext==0
){
*pbCkpt = 1;
}
}
if( rc==LSM_OK && bDirty ){
lsmFinishWork(pDb, 0, &rc);
}else{
/* Either an error occurred or there is nothing dirty. Close the worker
** transaction with a dummy error code so that rc is left untouched.
** NOTE(review): presumably the LSM_BUSY value stops lsmFinishWork()
** from saving the snapshot - confirm against lsmFinishWork(). */
int rcdummy = LSM_BUSY;
lsmFinishWork(pDb, 0, &rcdummy);
*pnWrite = 0;
}
assert( pDb->pWorker==0 );
return rc;
}
/*
** Call doLsmSingleWork() repeatedly until nPage pages have been written,
** a call does not request a checkpoint, or an error occurs. After each
** call that requests a checkpoint, lsm_checkpoint() is run.
**
** A negative nPage means "no limit": each call is then given a budget of
** 0x7FFFFFFF pages. If nPage is zero no work is attempted.
**
** If pnWrite is not NULL, *pnWrite is set to the total number of pages
** written on success, or to zero if an error is returned.
*/
static int doLsmWork(lsm_db *pDb, int nMerge, int nPage, int *pnWrite){
  int rc = LSM_OK;                /* Return code */
  int nTotal = 0;                 /* Pages written so far */
  int bAgain = (nPage!=0);        /* True to run another round */

  assert( nMerge>=1 );

  while( bAgain ){
    int bCkpt = 0;                /* Set if a checkpoint is requested */
    int nThis = 0;                /* Pages written by this round */
    int nReq;                     /* Budget for this round */

    if( nPage<0 ){
      nReq = (int)0x7FFFFFFF;
    }else{
      nReq = nPage - nTotal;
    }

    rc = doLsmSingleWork(pDb, 0, nMerge, nReq, &nThis, &bCkpt);
    nTotal += nThis;
    if( rc==LSM_OK && bCkpt ){
      rc = lsm_checkpoint(pDb, 0);
    }

    bAgain = ( rc==LSM_OK && bCkpt && (nTotal<nPage || nPage<0) );
  }

  if( pnWrite ){
    *pnWrite = (rc==LSM_OK) ? nTotal : 0;
  }
  return rc;
}
/*
** Perform work to merge database segments together.
**
** nMerge is the number of segments to merge at once; if it is zero or
** negative the connection's configured pDb->nMerge is used. nKB limits the
** amount of work in KB; a negative value means "no limit".
**
** If pnWrite is not NULL, *pnWrite is set to the amount of data written,
** in KB (rounded up). Returns LSM_MISUSE if pDb has an open transaction or
** cursor, otherwise the result of doLsmWork().
*/
int lsm_work(lsm_db *pDb, int nMerge, int nKB, int *pnWrite){
  int rc;                         /* Return code */
  int nPgsz;                      /* Nominal page size in bytes */
  int nPage;                      /* Equivalent of nKB in pages */
  int nWrite = 0;                 /* Number of pages written */

  /* This function may not be called if pDb has an open read or write
  ** transaction. Return LSM_MISUSE if an application attempts this. */
  if( pDb->nTransOpen || pDb->pCsr ) return LSM_MISUSE_BKPT;
  if( nMerge<=0 ) nMerge = pDb->nMerge;

  lsmFsPurgeCache(pDb->pFS);

  /* Convert from KB to pages, rounding up. */
  nPgsz = lsmFsPageSize(pDb->pFS);
  if( nKB>=0 ){
    nPage = ((i64)nKB * 1024 + nPgsz - 1) / nPgsz;
  }else{
    nPage = -1;
  }

  rc = doLsmWork(pDb, nMerge, nPage, &nWrite);

  if( pnWrite ){
    /* Convert back from pages to KB, rounding up. This is the inverse of
    ** the conversion above: multiply by the page size, divide by 1024. */
    *pnWrite = (int)(((i64)nWrite * nPgsz + 1023) / 1024);
  }
  return rc;
}
/*
** Flush the contents of the in-memory tree(s) of connection db to disk.
**
** Returns LSM_MISUSE if db has an open transaction or cursor. Otherwise a
** write transaction is opened, lsmFlushTreeToDisk() is called and, only if
** that succeeds, the in-memory trees are discarded and the transaction is
** committed. If opening the transaction or the flush fails, the write
** transaction is rolled back and the error code is returned, so that a
** failed flush is never reported as success and never drops the trees.
*/
int lsm_flush(lsm_db *db){
  int rc;

  if( db->nTransOpen>0 || db->pCsr ){
    rc = LSM_MISUSE_BKPT;
  }else{
    rc = lsmBeginWriteTrans(db);
    if( rc==LSM_OK ){
      rc = lsmFlushTreeToDisk(db);
    }
    if( rc==LSM_OK ){
      /* Everything is on disk: drop the old tree, then demote and drop
      ** the current one. */
      lsmTreeDiscardOld(db);
      lsmTreeMakeOld(db);
      lsmTreeDiscardOld(db);
    }

    if( rc==LSM_OK ){
      rc = lsmFinishWriteTrans(db, 1);
    }else{
      lsmFinishWriteTrans(db, 0);
    }
    lsmFinishReadTrans(db);
  }

  return rc;
}
/*
** This function is called in auto-work mode to perform merging work on
** the data structure. It performs enough merging work to prevent the
** height of the tree from growing indefinitely assuming that roughly
** nUnit database pages worth of data have been written to the database
** (i.e. the in-memory tree) since the last call.
**
** The amount of work requested from doLsmWork() is nUnit multiplied by
** the number of levels in the client snapshot, plus one if there is an
** old in-memory tree. An LSM_BUSY result from doLsmWork() is treated as
** success.
**
** If an old tree was present and the connection has open cursors, the
** cursors are saved beforehand and, after the work, the client snapshot is
** reloaded and the cursors restored.
*/
int lsmSortedAutoWork(
  lsm_db *pDb,                    /* Database handle */
  int nUnit                       /* Pages of data written to in-memory tree */
){
  int rc = LSM_OK;                /* Return code */
  int nLevel = 0;                 /* Levels counted, plus 1 for an old tree */
  int bSaved = 0;                 /* True once cursors have been saved */
  int nTarget;                    /* Units of work to request */
  Level *pIter;                   /* Used to walk the level list */

  assert( pDb->pWorker==0 );
  assert( pDb->nTransOpen>0 );

  /* Determine how many units of work to do before returning. One unit of
  ** work is achieved by writing one page (~4KB) of merged data. */
  pIter = lsmDbSnapshotLevel(pDb->pClient);
  while( pIter ){
    nLevel++;
    pIter = pIter->pNext;
  }
  if( lsmTreeHasOld(pDb) ){
    nLevel++;
    bSaved = 1;
    rc = lsmSaveCursors(pDb);
    if( rc!=LSM_OK ) return rc;
  }

  /* Nothing on disk and no old tree: there is no work to do. */
  if( nLevel<=0 ) return rc;

  nTarget = nUnit * nLevel;
#ifdef LSM_LOG_WORK
  lsmLogMessage(pDb, rc, "lsmSortedAutoWork(): %d*%d = %d pages",
      nUnit, nLevel, nTarget);
#endif
  assert( nTarget>=0 );

  rc = doLsmWork(pDb, pDb->nMerge, nTarget, 0);
  if( rc==LSM_BUSY ) rc = LSM_OK;

  if( bSaved==0 || pDb->pCsr==0 ) return rc;

  /* Throw away the client snapshot and load the latest one, then put the
  ** saved cursors back. */
  lsmMCursorFreeCache(pDb);
  lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
  pDb->pClient = 0;
  if( rc==LSM_OK ){
    rc = lsmCheckpointLoad(pDb, 0);
  }
  if( rc==LSM_OK ){
    rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot, &pDb->pClient);
  }
  if( rc==LSM_OK ){
    rc = lsmRestoreCursors(pDb);
  }
  return rc;
}
/*
** This function is only called during system shutdown. The contents of
** any in-memory trees present (old or current) are written out to disk.
**
** A worker transaction is opened, existing segments are merged for as
** long as sortedDbIsFull() reports the database as full, and then a new
** top-level is written from both trees (TREE_BOTH). lsmFinishWork() is
** always called, even if an earlier step failed.
*/
int lsmFlushTreeToDisk(lsm_db *pDb){
  int rc = lsmBeginWork(pDb);

  if( rc==LSM_OK ){
    while( sortedDbIsFull(pDb) ){
      rc = sortedWork(pDb, 256, pDb->nMerge, 1, 0);
      if( rc!=LSM_OK ) break;
    }
    if( rc==LSM_OK ){
      rc = sortedNewToplevel(pDb, TREE_BOTH, 0);
    }
  }

  lsmFinishWork(pDb, 1, &rc);
  return rc;
}
/*
** Return a string representation of the segment passed as the only argument.
** Space for the returned string is allocated using lsmMalloc(), and should
** be freed by the caller using lsmFree().
**
** The string has the form "/<first>.<last> root=<n>\" for a segment with
** a b-tree root, or "|<first>.<last> size=<n>|" otherwise, padded with
** spaces towards a minimum width of nMin characters.
**
** Page numbers are LsmPgno values and are formatted as 64-bit integers.
** NULL is returned if an intermediate allocation fails.
*/
static char *segToString(lsm_env *pEnv, Segment *pSeg, int nMin){
  LsmPgno nSize = pSeg->nSize;
  LsmPgno iRoot = pSeg->iRoot;
  LsmPgno iFirst = pSeg->iFirst;
  LsmPgno iLast = pSeg->iLastPg;
  char *z = 0;
  char *z1;
  char *z2;
  int nPad;

  z1 = lsmMallocPrintf(pEnv, "%lld.%lld", (long long)iFirst, (long long)iLast);
  if( iRoot ){
    z2 = lsmMallocPrintf(pEnv, "root=%lld", (long long)iRoot);
  }else{
    z2 = lsmMallocPrintf(pEnv, "size=%lld", (long long)nSize);
  }

  /* Only build the final string if both parts were allocated; strlen()
  ** must not be handed a NULL pointer. */
  if( z1 && z2 ){
    nPad = nMin - 2 - (int)strlen(z1) - 1 - (int)strlen(z2);
    nPad = LSM_MAX(0, nPad);

    if( iRoot ){
      z = lsmMallocPrintf(pEnv, "/%s %*s%s\\", z1, nPad, "", z2);
    }else{
      z = lsmMallocPrintf(pEnv, "|%s %*s%s|", z1, nPad, "", z2);
    }
  }
  lsmFree(pEnv, z1);
  lsmFree(pEnv, z2);

  return z;
}
/*
** Write a description of segment pSeg (as produced by segToString() with
** minimum width nMin) into the buffer aBuf, which is nBuf bytes in size.
** If pSeg is NULL an empty string is written. The output is always
** nul-terminated within the nBuf bytes and truncated if it does not fit.
**
** Returns the number of bytes written, not counting the terminator.
*/
static int fileToString(
  lsm_db *pDb,                    /* For xMalloc() */
  char *aBuf,
  int nBuf,
  int nMin,
  Segment *pSeg
){
  int i = 0;

  if( nBuf<=0 ) return 0;
  aBuf[0] = '\0';

  if( pSeg ){
    char *zSeg;

    zSeg = segToString(pDb->pEnv, pSeg, nMin);
    snprintf(&aBuf[i], nBuf-i, "%s", zSeg ? zSeg : "");
    i += (int)strlen(&aBuf[i]);
    lsmFree(pDb->pEnv, zSeg);

#ifdef LSM_LOG_FREELIST
    zSeg = 0;
    lsmInfoArrayStructure(pDb, 1, pSeg->iFirst, &zSeg);
    /* Space left is (nBuf-i), not (nBuf-1): i bytes are already in use. */
    snprintf(&aBuf[i], nBuf-i, " (%s)", zSeg ? zSeg : "");
    i += (int)strlen(&aBuf[i]);
    lsmFree(pDb->pEnv, zSeg);
#endif
    /* The last valid index is nBuf-1; aBuf[nBuf] is out of bounds. */
    aBuf[nBuf-1] = 0;
  }

  return i;
}
/*
** Debugging aid. Build a one-line description of page pPg of segment pRun
** and pass it to lsmLogMessage(). The line contains the cell count, the
** footer pointer and flags, and for each cell its topic, its key (with
** non-alphanumeric bytes shown as '.'), optionally its value (if bVals is
** true) and its page pointer.
**
** For cells with eType==0 the key is read from the page the cell refers
** to. If that page cannot be loaded the key is printed as empty.
*/
void sortedDumpPage(lsm_db *pDb, Segment *pRun, Page *pPg, int bVals){
  LsmBlob blob = {0, 0, 0};       /* LsmBlob used for keys */
  LsmString s;
  int i;

  int nRec;
  LsmPgno iPtr;
  int flags;
  u8 *aData;
  int nData;

  aData = fsPageData(pPg, &nData);

  nRec = pageGetNRec(aData, nData);
  iPtr = pageGetPtr(aData, nData);
  flags = pageGetFlags(aData, nData);

  lsmStringInit(&s, pDb->pEnv);
  lsmStringAppendf(&s,"nCell=%d iPtr=%lld flags=%d {",
      nRec, (long long)iPtr, flags
  );
  /* The footer pointer is not added to cell pointers on b-tree pages. */
  if( flags&SEGMENT_BTREE_FLAG ) iPtr = 0;

  for(i=0; i<nRec; i++){
    Page *pRef = 0;               /* Pointer to page iRef */
    int iChar;
    u8 *aKey = 0; int nKey = 0;   /* Key */
    u8 *aVal = 0; int nVal = 0;   /* Value */
    int iTopic = 0;
    u8 *aCell;
    i64 iPgPtr;
    int eType;

    aCell = pageGetCell(aData, nData, i);
    eType = *aCell++;
    assert( (flags & SEGMENT_BTREE_FLAG) || eType!=0 );
    aCell += lsmVarintGet64(aCell, &iPgPtr);

    if( eType==0 ){
      LsmPgno iRef;               /* Page number of referenced page */
      aCell += lsmVarintGet64(aCell, &iRef);
      /* Only read the key if the referenced page was actually loaded. */
      if( lsmFsDbPageGet(pDb->pFS, pRun, iRef, &pRef)==LSM_OK && pRef ){
        aKey = pageGetKey(pRun, pRef, 0, &iTopic, &nKey, &blob);
      }
      if( aKey==0 ) nKey = 0;
    }else{
      aCell += lsmVarintGet32(aCell, &nKey);
      if( rtIsWrite(eType) ) aCell += lsmVarintGet32(aCell, &nVal);
      sortedReadData(0, pPg, (aCell-aData), nKey+nVal, (void **)&aKey, &blob);
      if( aKey ){
        aVal = &aKey[nKey];
      }else{
        nKey = 0;
        nVal = 0;
      }
      iTopic = eType;
    }

    lsmStringAppendf(&s, "%s%2X:", (i==0?"":" "), iTopic);
    for(iChar=0; iChar<nKey; iChar++){
      lsmStringAppendf(&s, "%c", isalnum(aKey[iChar]) ? aKey[iChar] : '.');
    }
    if( nVal>0 && bVals ){
      lsmStringAppendf(&s, "##");
      for(iChar=0; iChar<nVal; iChar++){
        lsmStringAppendf(&s, "%c", isalnum(aVal[iChar]) ? aVal[iChar] : '.');
      }
    }

    lsmStringAppendf(&s, " %lld", (long long)(iPgPtr+iPtr));
    lsmFsPageRelease(pRef);
  }
  lsmStringAppend(&s, "}", 1);

  /* Page numbers are 64-bit (LsmPgno), so they must not be printed with
  ** a plain %d. */
  lsmLogMessage(pDb, LSM_OK, " Page %lld: %s",
      (long long)lsmFsPageNumber(pPg), s.z
  );
  lsmStringClear(&s);

  sortedBlobFree(&blob);
}
/*
** Decode cell iCell of page pPg and return its components through the
** output parameters. Any of the output pointers may be NULL, in which case
** that component is not returned.
**
**   *peType    The cell's flags/type byte.
**   *piPgPtr   The cell's page pointer, read as a 32-bit varint.
**   *paKey     Pointer to the key, *pnKey its size in bytes.
**   *paVal     Pointer to the value (NULL if none), *pnVal its size.
**
** For cells with eType==0 the key lives on another page. If bIndirect is
** true that page is loaded and the key is copied into pBlob; otherwise the
** 10-byte placeholder "<indirect>" is returned as the key. For other cells
** the key and value are obtained using sortedReadData(), with pBlob as the
** buffer argument.
**
** NOTE(review): the page pointer is returned as an int because of the
** piPgPtr parameter type, while sortedDumpPage() reads the same field as a
** 64-bit value - confirm whether truncation matters for large databases.
*/
static void infoCellDump(
  lsm_db *pDb,                    /* Database handle */
  Segment *pSeg,                  /* Segment page belongs to */
  int bIndirect,                  /* True to follow indirect refs */
  Page *pPg,
  int iCell,
  int *peType,
  int *piPgPtr,
  u8 **paKey, int *pnKey,
  u8 **paVal, int *pnVal,
  LsmBlob *pBlob
){
  u8 *aData; int nData;           /* Page data */
  u8 *aKey = 0; int nKey = 0;     /* Key */
  u8 *aVal = 0; int nVal = 0;     /* Value */
  int eType;
  int iPgPtr;
  Page *pRef = 0;                 /* Pointer to page iRef */
  u8 *aCell;

  aData = fsPageData(pPg, &nData);

  aCell = pageGetCell(aData, nData, iCell);
  eType = *aCell++;
  aCell += lsmVarintGet32(aCell, &iPgPtr);

  if( eType==0 ){
    int dummy;
    LsmPgno iRef;                 /* Page number of referenced page */
    aCell += lsmVarintGet64(aCell, &iRef);
    if( bIndirect ){
      /* Leave the key empty if the referenced page cannot be loaded. */
      if( lsmFsDbPageGet(pDb->pFS, pSeg, iRef, &pRef)==LSM_OK && pRef ){
        pageGetKeyCopy(pDb->pEnv, pSeg, pRef, 0, &dummy, pBlob);
        aKey = (u8 *)pBlob->pData;
        nKey = pBlob->nData;
      }
      lsmFsPageRelease(pRef);
    }else{
      /* The key length excludes the string's nul terminator. */
      aKey = (u8 *)"<indirect>";
      nKey = (int)(sizeof("<indirect>") - 1);
    }
  }else{
    aCell += lsmVarintGet32(aCell, &nKey);
    if( rtIsWrite(eType) ) aCell += lsmVarintGet32(aCell, &nVal);
    sortedReadData(pSeg, pPg, (aCell-aData), nKey+nVal, (void **)&aKey, pBlob);
    if( aKey ){
      aVal = &aKey[nKey];
    }else{
      /* No data was returned: report an empty key and value. */
      nKey = 0;
      nVal = 0;
    }
  }

  if( peType ) *peType = eType;
  if( piPgPtr ) *piPgPtr = iPgPtr;
  if( paKey ) *paKey = aKey;
  if( paVal ) *paVal = aVal;
  if( pnKey ) *pnKey = nKey;
  if( pnVal ) *pnVal = nVal;
}
/*
** Append the n bytes at z to string pStr. If bHex is non-zero each byte is
** written as two upper-case hex digits. Otherwise alphanumeric bytes are
** written as-is and every other byte is written as '.'.
**
** Always returns LSM_OK.
*/
static int infoAppendBlob(LsmString *pStr, int bHex, u8 *z, int n){
  int iByte = 0;

  while( iByte<n ){
    u8 c = z[iByte++];
    if( !bHex ){
      lsmStringAppendf(pStr, "%c", isalnum(c) ? c : '.');
    }else{
      lsmStringAppendf(pStr, "%02X", c);
    }
  }
  return LSM_OK;
}
/* Flags accepted by infoPageDump(). */
#define INFO_PAGE_DUMP_DATA 0x01      /* Append a raw hex dump of the page */
#define INFO_PAGE_DUMP_VALUES 0x02    /* Include values as well as keys */
#define INFO_PAGE_DUMP_HEX 0x04       /* Format keys/values as hex */
#define INFO_PAGE_DUMP_INDIRECT 0x08  /* Follow indirect key references */

/*
** Produce a human-readable description of database page iPg and return it
** in *pzOut. The string is built with an LsmString and ownership passes to
** the caller. *pzOut is set to NULL if an error occurs.
**
** The output consists of a header (page number and size, record count,
** footer pointer, flags), then one line per cell, then - if
** INFO_PAGE_DUMP_DATA is set - a hex/ASCII dump of the raw page content.
**
** Returns LSM_ERROR if iPg is zero, otherwise LSM_OK or the error code of
** the failing file-system call.
*/
static int infoPageDump(
  lsm_db *pDb,                    /* Database handle */
  LsmPgno iPg,                    /* Page number of page to dump */
  int flags,
  char **pzOut                    /* OUT: lsmMalloc'd string */
){
  int rc = LSM_OK;                /* Return code */
  Page *pPg = 0;                  /* Handle for page iPg */
  int i, j;                       /* Loop counters */
  const int perLine = 16;         /* Bytes per line in the raw hex dump */
  Segment *pSeg = 0;
  Snapshot *pSnap;

  int bValues = (flags & INFO_PAGE_DUMP_VALUES);
  /* bHex is used arithmetically below, so it must be exactly 0 or 1. */
  int bHex = (flags & INFO_PAGE_DUMP_HEX) ? 1 : 0;
  int bData = (flags & INFO_PAGE_DUMP_DATA);
  int bIndirect = (flags & INFO_PAGE_DUMP_INDIRECT);

  *pzOut = 0;
  if( iPg==0 ) return LSM_ERROR;

  assert( pDb->pClient || pDb->pWorker );
  pSnap = pDb->pClient;
  if( pSnap==0 ) pSnap = pDb->pWorker;
  if( pSnap->redirect.n>0 ){
    /* Block redirects are in effect. Use the last segment of the last
    ** level for reading cell data, but only if it contains page iPg. */
    Level *pLvl;
    int bUse = 0;
    for(pLvl=pSnap->pLevel; pLvl->pNext; pLvl=pLvl->pNext);
    pSeg = (pLvl->nRight==0 ? &pLvl->lhs : &pLvl->aRhs[pLvl->nRight-1]);
    rc = lsmFsSegmentContainsPg(pDb->pFS, pSeg, iPg, &bUse);
    if( bUse==0 ){
      pSeg = 0;
    }
  }

  /* iPg is a real page number (not subject to redirection). So it is safe
  ** to pass a NULL in place of the segment pointer as the second argument
  ** to lsmFsDbPageGet() here. */
  if( rc==LSM_OK ){
    rc = lsmFsDbPageGet(pDb->pFS, 0, iPg, &pPg);
  }

  if( rc==LSM_OK ){
    LsmBlob blob = {0, 0, 0, 0};
    int nKeyWidth = 0;
    LsmString str;
    int nRec;
    LsmPgno iPtr;
    int flags2;
    int iCell;
    u8 *aData; int nData;         /* Page data and size thereof */

    aData = fsPageData(pPg, &nData);
    nRec = pageGetNRec(aData, nData);
    iPtr = pageGetPtr(aData, nData);
    flags2 = pageGetFlags(aData, nData);

    lsmStringInit(&str, pDb->pEnv);
    lsmStringAppendf(&str, "Page : %lld (%d bytes)\n", (long long)iPg, nData);
    lsmStringAppendf(&str, "nRec : %d\n", nRec);
    lsmStringAppendf(&str, "iPtr : %lld\n", (long long)iPtr);
    lsmStringAppendf(&str, "flags: %04x\n", flags2);
    lsmStringAppendf(&str, "\n");

    /* First pass: find the widest key so that values can be aligned. */
    for(iCell=0; iCell<nRec; iCell++){
      int nKey = 0;
      infoCellDump(
          pDb, pSeg, bIndirect, pPg, iCell, 0, 0, 0, &nKey, 0, 0, &blob
      );
      if( nKey>nKeyWidth ) nKeyWidth = nKey;
    }
    if( bHex ) nKeyWidth = nKeyWidth * 2;

    /* Second pass: one output line per cell. */
    for(iCell=0; iCell<nRec; iCell++){
      u8 *aKey; int nKey = 0;     /* Key */
      u8 *aVal; int nVal = 0;     /* Value */
      int iPgPtr;
      int eType;
      LsmPgno iAbsPtr;
      char zFlags[8];

      infoCellDump(pDb, pSeg, bIndirect, pPg, iCell, &eType, &iPgPtr,
          &aKey, &nKey, &aVal, &nVal, &blob
      );
      iAbsPtr = iPgPtr + ((flags2 & SEGMENT_BTREE_FLAG) ? 0 : iPtr);

      lsmFlagsToString(eType, zFlags);
      /* iAbsPtr is a 64-bit page number and needs a 64-bit conversion. */
      lsmStringAppendf(&str, "%s %lld (%s) ",
          zFlags, (long long)iAbsPtr, (rtTopic(eType) ? "sys" : "usr")
      );
      infoAppendBlob(&str, bHex, aKey, nKey);
      if( nVal>0 && bValues ){
        /* A key occupies nKey characters, or 2*nKey in hex mode. */
        lsmStringAppendf(&str, "%*s", nKeyWidth - (nKey*(1+bHex)), "");
        lsmStringAppendf(&str, " ");
        infoAppendBlob(&str, bHex, aVal, nVal);
      }
      if( rtTopic(eType) ){
        int iBlk = (int)~lsmGetU32(aKey);
        lsmStringAppendf(&str, " (block=%d", iBlk);
        if( nVal>0 ){
          i64 iSnap = lsmGetU64(aVal);
          lsmStringAppendf(&str, " snapshot=%lld", iSnap);
        }
        lsmStringAppendf(&str, ")");
      }
      lsmStringAppendf(&str, "\n");
    }

    if( bData ){
      lsmStringAppendf(&str, "\n-------------------"
          "-------------------------------------------------------------\n");
      /* The format has a single conversion, so pass a single argument. */
      lsmStringAppendf(&str, "Page %lld\n", (long long)iPg);
      for(i=0; i<nData; i += perLine){
        lsmStringAppendf(&str, "%04x: ", i);
        for(j=0; j<perLine; j++){
          /* Valid offsets are 0..nData-1, so nData itself is past the end. */
          if( i+j>=nData ){
            lsmStringAppendf(&str, " ");
          }else{
            lsmStringAppendf(&str, "%02x ", aData[i+j]);
          }
        }
        lsmStringAppendf(&str, " ");
        for(j=0; j<perLine; j++){
          if( i+j>=nData ){
            lsmStringAppendf(&str, " ");
          }else{
            lsmStringAppendf(&str,"%c", isprint(aData[i+j]) ? aData[i+j] : '.');
          }
        }
        lsmStringAppendf(&str,"\n");
      }
    }

    *pzOut = str.z;
    sortedBlobFree(&blob);
    lsmFsPageRelease(pPg);
  }

  return rc;
}
/*
** Produce a description of page iPg that includes cell values and the raw
** page dump, and return it in *pzOut. Keys and values are formatted as hex
** if bHex is true. This is a thin wrapper around infoPageDump() and returns
** whatever that function returns.
*/
int lsmInfoPageDump(
  lsm_db *pDb,                    /* Database handle */
  LsmPgno iPg,                    /* Page number of page to dump */
  int bHex,                       /* True to output key/value in hex form */
  char **pzOut                    /* OUT: lsmMalloc'd string */
){
  int mDump;                      /* Mask of INFO_PAGE_DUMP_* flags */

  if( bHex ){
    mDump = INFO_PAGE_DUMP_DATA | INFO_PAGE_DUMP_VALUES | INFO_PAGE_DUMP_HEX;
  }else{
    mDump = INFO_PAGE_DUMP_DATA | INFO_PAGE_DUMP_VALUES;
  }
  return infoPageDump(pDb, iPg, mDump, pzOut);
}
/*
** Debugging aid. Log a one-line summary of segment pRun followed by a dump
** of each of its pages, obtained from infoPageDump(). Values are included
** in the page dumps if bVals is true. This is a no-op if pRun is NULL or
** has no first page.
**
** Iteration stops at the first page that cannot be loaded, so a failing
** file-system call ends the dump early instead of leaving the page
** pointers undefined.
*/
void sortedDumpSegment(lsm_db *pDb, Segment *pRun, int bVals){
  assert( pDb->xLog );
  if( pRun && pRun->iFirst ){
    int flags = (bVals ? INFO_PAGE_DUMP_VALUES : 0);
    int rc;
    char *zSeg;
    Page *pPg = 0;

    zSeg = segToString(pDb->pEnv, pRun, 0);
    lsmLogMessage(pDb, LSM_OK, "Segment: %s", zSeg);
    lsmFree(pDb->pEnv, zSeg);

    rc = lsmFsDbPageGet(pDb->pFS, pRun, pRun->iFirst, &pPg);
    while( rc==LSM_OK && pPg ){
      Page *pNext = 0;
      char *z = 0;
      infoPageDump(pDb, lsmFsPageNumber(pPg), flags, &z);
      /* infoPageDump() leaves z set to NULL on failure. */
      if( z ) lsmLogMessage(pDb, LSM_OK, "%s", z);
      lsmFree(pDb->pEnv, z);
#if 0
      sortedDumpPage(pDb, pRun, pPg, bVals);
#endif
      rc = lsmFsDbPageNext(pRun, pPg, 1, &pNext);
      lsmFsPageRelease(pPg);
      pPg = pNext;
    }
    /* Release a page still held if the loop ended on an error. */
    lsmFsPageRelease(pPg);
  }
}
/*
** Invoke the log callback zero or more times with messages that describe
** the current database structure.
**
** One line is logged per segment row of each level: the level's number, age
** and flags, then the left-hand segment and a right-hand segment. If bKeys
** is true the content of every segment is dumped as well (with values if
** bVals is true). Finally the free-list is logged.
**
** At most 24 right-hand segments per level are shown; any beyond that are
** left out of the summary rather than overflowing the local arrays.
*/
void lsmSortedDumpStructure(
  lsm_db *pDb,                    /* Database handle (used for xLog callback) */
  Snapshot *pSnap,                /* Snapshot to dump */
  int bKeys,                      /* Output the keys from each segment */
  int bVals,                      /* Output the values from each segment */
  const char *zWhy                /* Caption to print near top of dump */
){
  Snapshot *pDump = pSnap;
  Level *pTopLevel;
  char *zFree = 0;

  assert( pSnap );
  pTopLevel = lsmDbSnapshotLevel(pDump);
  if( pDb->xLog && pTopLevel ){
    static int nCall = 0;
    Level *pLevel;
    int iLevel = 0;

    nCall++;
    lsmLogMessage(pDb, LSM_OK, "Database structure %d (%s)", nCall, zWhy);

    for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){
      char zLeft[1024];
      char zRight[1024];
      int i = 0;

      Segment *aLeft[24];
      Segment *aRight[24];
      const int nSegMax = (int)(sizeof(aRight)/sizeof(aRight[0]));

      int nLeft = 0;
      int nRight = 0;

      Segment *pSeg = &pLevel->lhs;
      aLeft[nLeft++] = pSeg;

      /* Never store more entries than aRight[] can hold. */
      for(i=0; i<pLevel->nRight && nRight<nSegMax; i++){
        aRight[nRight++] = &pLevel->aRhs[i];
      }

#ifdef LSM_LOG_FREELIST
      if( nRight ){
        /* Make room for the leading NULL entry without overflowing. */
        if( nRight==nSegMax ) nRight--;
        memmove(&aRight[1], aRight, sizeof(aRight[0])*nRight);
        aRight[0] = 0;
        nRight++;
      }
#endif

      for(i=0; i<nLeft || i<nRight; i++){
        int iPad = 0;
        char zLevel[32];
        zLeft[0] = '\0';
        zRight[0] = '\0';

        if( i<nLeft ){
          fileToString(pDb, zLeft, sizeof(zLeft), 24, aLeft[i]);
        }
        if( i<nRight ){
          fileToString(pDb, zRight, sizeof(zRight), 24, aRight[i]);
        }

        /* Only the first row of a level carries the level caption. */
        if( i==0 ){
          snprintf(zLevel, sizeof(zLevel), "L%d: (age=%d) (flags=%.4x)",
              iLevel, (int)pLevel->iAge, (int)pLevel->flags
          );
        }else{
          zLevel[0] = '\0';
        }

        if( nRight==0 ){
          iPad = 10;
        }

        lsmLogMessage(pDb, LSM_OK, "% 25s % *s% -35s %s",
            zLevel, iPad, "", zLeft, zRight
        );
      }

      iLevel++;
    }

    if( bKeys ){
      for(pLevel=pTopLevel; pLevel; pLevel=pLevel->pNext){
        int i;
        sortedDumpSegment(pDb, &pLevel->lhs, bVals);
        for(i=0; i<pLevel->nRight; i++){
          sortedDumpSegment(pDb, &pLevel->aRhs[i], bVals);
        }
      }
    }
  }

  lsmInfoFreelist(pDb, &zFree);
  lsmLogMessage(pDb, LSM_OK, "Freelist: %s", zFree);
  lsmFree(pDb->pEnv, zFree);

  assert( lsmFsIntegrityCheck(pDb) );
}
/*
** Free the linked list of Level structures that starts at pLevel, calling
** sortedFreeLevel() on each element. Passing NULL is a harmless no-op.
*/
void lsmSortedFreeLevel(lsm_env *pEnv, Level *pLevel){
  Level *pIter = pLevel;
  while( pIter ){
    /* Fetch the successor first: pIter is gone once it has been freed. */
    Level *pFollowing = pIter->pNext;
    sortedFreeLevel(pEnv, pIter);
    pIter = pFollowing;
  }
}
/*
** For every multi-cursor in the list starting at pDb->pCsr, call
** lsmTreeCursorSave() on both of its in-memory tree cursors.
*/
void lsmSortedSaveTreeCursors(lsm_db *pDb){
  MultiCursor *pIter = pDb->pCsr;
  while( pIter ){
    int iTree;
    for(iTree=0; iTree<2; iTree++){
      lsmTreeCursorSave(pIter->apTreeCsr[iTree]);
    }
    pIter = pIter->pNext;
  }
}
/*
** The buffer of page pPg holds content laid out for a page of nOrig bytes,
** while the buffer reported by lsmFsPageData() is nData bytes. Move the
** bytes from offset SEGMENT_EOF(nOrig, nEntry) up to offset nOrig forward
** by (nData-nOrig) bytes, so that they end at offset nData.
*/
void lsmSortedExpandBtreePage(Page *pPg, int nOrig){
  int nData;                      /* Size of the page buffer */
  u8 *aData = lsmFsPageData(pPg, &nData);
  int nEntry = pageGetNRec(aData, nOrig);
  int iFrom = SEGMENT_EOF(nOrig, nEntry);     /* First byte to move */
  int nShift = nData - nOrig;                 /* Distance to move it */
  int nByte = nOrig - iFrom;                  /* Number of bytes to move */

  memmove(&aData[iFrom + nShift], &aData[iFrom], nByte);
}
#ifdef LSM_DEBUG_EXPENSIVE
/*
** Debugging check. Walk every page of segment pSeg and, for each page that
** does not have SEGMENT_BTREE_FLAG set, assert() that the keys it holds are
** in strictly ascending order according to sortedKeyCompare(). The first
** key of a page is also compared against the most recent key held in the
** second buffer from an earlier page, if any.
*/
static void assertRunInOrder(lsm_db *pDb, Segment *pSeg){
  Page *pCur = 0;
  LsmBlob keyA = {0, 0, 0, 0};    /* Key of the current cell */
  LsmBlob keyB = {0, 0, 0, 0};    /* Key of the following (or previous) cell */

  lsmFsDbPageGet(pDb->pFS, pSeg, pSeg->iFirst, &pCur);
  while( pCur ){
    u8 *aPage; int nPage;
    Page *pFollowing;

    aPage = lsmFsPageData(pCur, &nPage);
    if( (pageGetFlags(aPage, nPage) & SEGMENT_BTREE_FLAG)==0 ){
      int nCell = pageGetNRec(aPage, nPage);
      int iCell;
      for(iCell=0; iCell<nCell; iCell++){
        int iTopicA, iTopicB;
        pageGetKeyCopy(pDb->pEnv, pSeg, pCur, iCell, &iTopicA, &keyA);

        /* Across a page boundary: the carried-over key must be smaller. */
        if( iCell==0 && keyB.nData ){
          assert( sortedKeyCompare(
                pDb->xCmp, iTopicB, keyB.pData, keyB.nData,
                iTopicA, keyA.pData, keyA.nData
          )<0 );
        }

        /* Within the page: this key must be smaller than the next one. */
        if( iCell<(nCell-1) ){
          pageGetKeyCopy(pDb->pEnv, pSeg, pCur, iCell+1, &iTopicB, &keyB);
          assert( sortedKeyCompare(
                pDb->xCmp, iTopicA, keyA.pData, keyA.nData,
                iTopicB, keyB.pData, keyB.nData
          )<0 );
        }
      }
    }

    lsmFsDbPageNext(pSeg, pCur, 1, &pFollowing);
    lsmFsPageRelease(pCur);
    pCur = pFollowing;
  }

  sortedBlobFree(&keyA);
  sortedBlobFree(&keyB);
}
#endif
#ifdef LSM_DEBUG_EXPENSIVE
/*
** This function is only included in the build if LSM_DEBUG_EXPENSIVE is
** defined. Its only purpose is to evaluate various assert() statements to
** verify that the database is well formed in certain respects.
**
** More specifically, it checks that the array pOne contains the required
** pointers to pTwo. Array pTwo must be a main array. pOne may be either a
** separators array or another main array. If pOne does not contain the
** correct set of pointers, an assert() statement fails.
**
** If bRhs is true, the checks that compare against the previous page of
** pTwo (iPrev) are skipped.
**
** This function always returns LSM_OK; errors from the underlying calls
** only cause the checks to stop early.
*/
static int assertPointersOk(
lsm_db *pDb, /* Database handle */
Segment *pOne, /* Segment containing pointers */
Segment *pTwo, /* Segment containing pointer targets */
int bRhs /* True if pTwo may have been Gobble()d */
){
int rc = LSM_OK; /* Error code */
SegmentPtr ptr1; /* Iterates through pOne */
SegmentPtr ptr2; /* Iterates through pTwo */
LsmPgno iPrev;
assert( pOne && pTwo );
/* Both objects are SegmentPtr, so sizeof(ptr1) is the right size for
** ptr2 as well. */
memset(&ptr1, 0, sizeof(ptr1));
memset(&ptr2, 0, sizeof(ptr1));
ptr1.pSeg = pOne;
ptr2.pSeg = pTwo;
segmentPtrEndPage(pDb->pFS, &ptr1, 0, &rc);
segmentPtrEndPage(pDb->pFS, &ptr2, 0, &rc);
/* Check that the footer pointer of the first page of pOne points to
** the first page of pTwo. */
iPrev = pTwo->iFirst;
if( ptr1.iPtr!=iPrev && !bRhs ){
assert( 0 );
}
if( rc==LSM_OK && ptr1.nCell>0 ){
rc = segmentPtrLoadCell(&ptr1, 0);
}
while( rc==LSM_OK && ptr2.pPg ){
LsmPgno iThis;
/* Advance to the next page of segment pTwo that contains at least
** one cell. Break out of the loop if the iterator reaches EOF. */
do{
rc = segmentPtrNextPage(&ptr2, 1);
assert( rc==LSM_OK );
}while( rc==LSM_OK && ptr2.pPg && ptr2.nCell==0 );
if( rc!=LSM_OK || ptr2.pPg==0 ) break;
iThis = lsmFsPageNumber(ptr2.pPg);
/* Pages with PGFTR_SKIP_THIS_FLAG or SEGMENT_BTREE_FLAG set are not
** checked. */
if( (ptr2.flags & (PGFTR_SKIP_THIS_FLAG|SEGMENT_BTREE_FLAG))==0 ){
/* Load the first cell in the array pTwo page. */
rc = segmentPtrLoadCell(&ptr2, 0);
/* Iterate forwards through pOne, searching for a key that matches the
** key ptr2.pKey/nKey. This key should have a pointer to the page that
** ptr2 currently points to. */
while( rc==LSM_OK ){
/* Compare by topic first, then by key using the database comparison
** function. */
int res = rtTopic(ptr1.eType) - rtTopic(ptr2.eType);
if( res==0 ){
res = pDb->xCmp(ptr1.pKey, ptr1.nKey, ptr2.pKey, ptr2.nKey);
}
if( res<0 ){
/* A smaller key in pOne must still point at the previous page. */
assert( bRhs || ptr1.iPtr+ptr1.iPgPtr==iPrev );
}else if( res>0 ){
/* pOne has moved past the key without finding a match. */
assert( 0 );
}else{
assert( ptr1.iPtr+ptr1.iPgPtr==iThis );
iPrev = iThis;
break;
}
rc = segmentPtrAdvance(0, &ptr1, 0);
/* Running off the end of pOne means the matching key is missing. */
if( ptr1.pPg==0 ){
assert( 0 );
}
}
}
}
segmentPtrReset(&ptr1, 0);
segmentPtrReset(&ptr2, 0);
return LSM_OK;
}
/*
** This function is only included in the build if LSM_DEBUG_EXPENSIVE is
** defined. Its only purpose is to evaluate various assert() statements to
** verify that the database is well formed in certain respects.
**
** More specifically, it checks that the b-tree embedded in array pRun
** contains the correct keys. If not, an assert() fails.
**
** The check is skipped if the segment has no b-tree root (pSeg->iRoot==0).
** Otherwise the segment's pages are walked in order alongside a b-tree
** cursor. Starting from the page after pSeg->iFirst, the first key of each
** page that is not flagged SEGMENT_BTREE_FLAG or PGFTR_SKIP_THIS_FLAG and
** holds at least one record must match the cursor's current key, and the
** page number must match the cursor's pointer.
**
** Returns LSM_OK or the error code of the first failing call.
*/
static int assertBtreeOk(
lsm_db *pDb,
Segment *pSeg
){
int rc = LSM_OK; /* Return code */
if( pSeg->iRoot ){
LsmBlob blob = {0, 0, 0}; /* Buffer used to cache overflow keys */
FileSystem *pFS = pDb->pFS; /* File system to read from */
Page *pPg = 0; /* Main run page */
BtreeCursor *pCsr = 0; /* Btree cursor */
rc = btreeCursorNew(pDb, pSeg, &pCsr);
if( rc==LSM_OK ){
rc = btreeCursorFirst(pCsr);
}
if( rc==LSM_OK ){
rc = lsmFsDbPageGet(pFS, pSeg, pSeg->iFirst, &pPg);
}
while( rc==LSM_OK ){
Page *pNext;
u8 *aData;
int nData;
int flags;
/* Step to the next page before checking, so the first page of the
** segment itself is never compared against the cursor. */
rc = lsmFsDbPageNext(pSeg, pPg, 1, &pNext);
lsmFsPageRelease(pPg);
pPg = pNext;
if( pPg==0 ) break;
aData = fsPageData(pPg, &nData);
flags = pageGetFlags(aData, nData);
if( rc==LSM_OK
&& 0==((SEGMENT_BTREE_FLAG|PGFTR_SKIP_THIS_FLAG) & flags)
&& 0!=pageGetNRec(aData, nData)
){
u8 *pKey;
int nKey;
int iTopic;
pKey = pageGetKey(pSeg, pPg, 0, &iTopic, &nKey, &blob);
assert( nKey==pCsr->nKey && 0==memcmp(pKey, pCsr->pKey, nKey) );
assert( lsmFsPageNumber(pPg)==pCsr->iPtr );
rc = btreeCursorNext(pCsr);
}
}
/* Once every page has been visited, the cursor must have no current
** key left. */
assert( rc!=LSM_OK || pCsr->pKey==0 );
if( pPg ) lsmFsPageRelease(pPg);
btreeCursorFree(pCsr);
sortedBlobFree(&blob);
}
return rc;
}
#endif /* ifdef LSM_DEBUG_EXPENSIVE */