Change the btree balance code so that it does not call balance_nonroot() recursively. (CVS 6729)

FossilOrigin-Name: 7863db904d6fc36417c923e3d135eb2c145b9013
This commit is contained in:
danielk1977 2009-06-08 14:49:45 +00:00
parent b054068d21
commit a50d9aa784
3 changed files with 367 additions and 340 deletions

View File

@ -1,5 +1,5 @@
C Increase\sthe\sversion\snumber\sto\s3.6.15\sin\spreparation\sfor\sthe\snext\srelease.\s(CVS\s6728)
D 2009-06-08T12:52:48
C Change\sthe\sbtree\sbalance\scode\sso\sthat\sit\sdoes\snot\scall\sbalance_nonroot()\srecursively.\s(CVS\s6729)
D 2009-06-08T14:49:46
F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0
F Makefile.in 8b8fb7823264331210cddf103831816c286ba446
F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654
@ -106,7 +106,7 @@ F src/auth.c 98db07c2088455797678eb1031f42d4d94d18a71
F src/backup.c ff50af53184a5fd7bdee4d620b5dabef74717c79
F src/bitvec.c 0ef0651714728055d43de7a4cdd95e703fac0119
F src/btmutex.c 9b899c0d8df3bd68f527b0afe03088321b696d3c
F src/btree.c bb9dbeaf781544314e181ffe81275a7db31acb30
F src/btree.c 50716b7a8bd32b9101bd3238ed7bd61fafe5cd67
F src/btree.h f70b694e8c163227369a66863b01fbff9009f323
F src/btreeInt.h df64030d632f8c8ac217ed52e8b6b3eacacb33a5
F src/build.c 20e02fd72249159ff6829009f3029d16d59cdff5
@ -733,7 +733,7 @@ F tool/speedtest2.tcl ee2149167303ba8e95af97873c575c3e0fab58ff
F tool/speedtest8.c 2902c46588c40b55661e471d7a86e4dd71a18224
F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e
F tool/vdbe-compress.tcl 672f81d693a03f80f5ae60bfefacd8a349e76746
P a255c645c46ae03b65f862858fe57e462076e1fc
R 8fb9b7e59a24bac8e4bf17a7db6d3e15
U drh
Z bb4a57ffdcc392156e0e6c8c5c319c52
P 456ea541d67221a573b63e264c83f4448f2ed569
R 7ada04c9a78d367df09e91faed8b31d8
U danielk1977
Z 47cd44a7ec9eea431d4519541126ee12

View File

@ -1 +1 @@
456ea541d67221a573b63e264c83f4448f2ed569
7863db904d6fc36417c923e3d135eb2c145b9013

View File

@ -9,7 +9,7 @@
** May you share freely, never taking more than you give.
**
*************************************************************************
** $Id: btree.c,v 1.619 2009/06/05 18:44:15 drh Exp $
** $Id: btree.c,v 1.620 2009/06/08 14:49:46 danielk1977 Exp $
**
** This file implements a external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information.
@ -28,7 +28,7 @@ static const char zMagicHeader[] = SQLITE_FILE_HEADER;
** macro.
*/
#if 0
int sqlite3BtreeTrace=0; /* True to enable tracing */
int sqlite3BtreeTrace=1; /* True to enable tracing */
# define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);}
#else
# define TRACE(X)
@ -2709,7 +2709,9 @@ static int autoVacuumCommit(BtShared *pBt){
return rc;
}
#endif /* ifndef SQLITE_OMIT_AUTOVACUUM */
#else /* ifndef SQLITE_OMIT_AUTOVACUUM */
# define setChildPtrmaps(x) SQLITE_OK
#endif
/*
** This routine does the first phase of a two-phase commit. This routine
@ -4649,6 +4651,8 @@ end_allocate_page:
return SQLITE_CORRUPT_BKPT;
}
(*ppPage)->isInit = 0;
}else{
*ppPage = 0;
}
return rc;
}
@ -5196,8 +5200,6 @@ static void assemblePage(
#define NN 1 /* Number of neighbors on either side of pPage */
#define NB (NN*2+1) /* Total pages involved in the balance */
/* Forward reference */
static int balance(BtCursor*, int);
#ifndef SQLITE_OMIT_QUICKBALANCE
/*
@ -5216,35 +5218,37 @@ static int balance(BtCursor*, int);
** pPage is the leaf page which is the right-most page in the tree.
** pParent is its parent. pPage must have a single overflow entry
** which is also the right-most entry on the page.
**
** The pSpace buffer is used to store a temporary copy of the divider
** cell that will be inserted into pParent. Such a cell consists of a 4
** byte page number followed by a variable length integer. In other
** words, at most 13 bytes. Hence the pSpace buffer must be at
** least 13 bytes in size.
*/
static int balance_quick(BtCursor *pCur){
MemPage *const pPage = pCur->apPage[pCur->iPage];
BtShared *const pBt = pCur->pBt;
static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
BtShared *const pBt = pPage->pBt; /* B-Tree Database */
MemPage *pNew = 0; /* Newly allocated page */
int rc; /* Return Code */
Pgno pgnoNew; /* Page number of pNew */
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
assert( sqlite3PagerIswriteable(pParent->pDbPage) );
if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT;
/* Allocate a new page. This page will become the right-sibling of pPage */
/* Allocate a new page. This page will become the right-sibling of
** pPage. Make the parent page writable, so that the new divider cell
** may be inserted. If both these operations are successful, proceed.
*/
rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
if( rc==SQLITE_OK ){
/* The parentCell buffer is used to store a temporary copy of the divider
** cell that will be inserted into pParent. Such a cell consists of a 4
** byte page number followed by a variable length integer. In other
** words, at most 13 bytes. Hence the parentCell buffer must be at
** least 13 bytes in size.
*/
MemPage * const pParent = pCur->apPage[pCur->iPage-1];
u8 parentCell[13];
u8 *pOut = &parentCell[4];
u8 *pOut = &pSpace[4];
u8 *pCell = pPage->aOvfl[0].pCell;
u16 szCell = cellSizePtr(pPage, pCell);
u8 *pStop;
assert( sqlite3PagerIswriteable(pNew->pDbPage) );
zeroPage(pNew, pPage->aData[0]);
assemblePage(pNew, 1, &pCell, &szCell);
pPage->nOverflow = 0;
@ -5260,9 +5264,9 @@ static int balance_quick(BtCursor *pCur){
** and the key value (a variable length integer, may have any value).
** The first of the while(...) loops below skips over the record-length
** field. The second while(...) loop copies the key value from the
** cell on pPage into the parentCell buffer.
** cell on pPage into the pSpace buffer.
*/
put4byte(parentCell, pPage->pgno);
put4byte(pSpace, pPage->pgno);
pCell = findCell(pPage, pPage->nCell-1);
pStop = &pCell[9];
while( (*(pCell++)&0x80) && pCell<pStop );
@ -5270,7 +5274,7 @@ static int balance_quick(BtCursor *pCur){
while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
/* Insert the new divider cell into pParent */
insertCell(pParent, pParent->nCell, parentCell, pOut-parentCell, 0, 0);
insertCell(pParent, pParent->nCell, pSpace, pOut-pSpace, 0, 0);
/* Set the right-child pointer of pParent to point to the new page. */
put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
@ -5288,33 +5292,25 @@ static int balance_quick(BtCursor *pCur){
/* Release the reference to the new page. */
releasePage(pNew);
/* At this point the pPage->nFree variable is not set correctly with
** respect to the content of the page (because it was set to 0 by
** insertCell). So call sqlite3BtreeInitPage() to make sure it is
** correct.
**
** This has to be done even if an error will be returned. Normally, if
** an error occurs during tree balancing, the contents of MemPage are
** not important, as they will be recalculated when the page is rolled
** back. But here, in balance_quick(), it is possible that pPage has
** not yet been marked dirty or written into the journal file. Therefore
** it will not be rolled back and so it is important to make sure that
** the page data and contents of MemPage are consistent.
*/
pPage->isInit = 0;
sqlite3BtreeInitPage(pPage);
assert( pPage->nOverflow==0 );
}
/* At this point the pPage->nFree variable is not set correctly with
** respect to the content of the page (because it was set to 0 by
** insertCell). So call sqlite3BtreeInitPage() to make sure it is
** correct.
**
** This has to be done even if an error will be returned. Normally, if
** an error occurs during tree balancing, the contents of MemPage are
** not important, as they will be recalculated when the page is rolled
** back. But here, in balance_quick(), it is possible that pPage has
** not yet been marked dirty or written into the journal file. Therefore
** it will not be rolled back and so it is important to make sure that
** the page data and contents of MemPage are consistent.
*/
pPage->isInit = 0;
sqlite3BtreeInitPage(pPage);
assert( pPage->nOverflow==0 );
/* If everything else succeeded, balance the parent page, in
** case the divider cell inserted caused it to become overfull.
*/
if( rc==SQLITE_OK ){
releasePage(pPage);
pCur->iPage--;
rc = balance(pCur, 0);
}
return rc;
}
#endif /* SQLITE_OMIT_QUICKBALANCE */
@ -5348,17 +5344,13 @@ static int balance_quick(BtCursor *pCur){
** in a corrupted state. So if this routine fails, the database should
** be rolled back.
*/
static int balance_nonroot(BtCursor *pCur){
MemPage *pPage; /* The over or underfull page to balance */
MemPage *pParent; /* The parent of pPage */
static int balance_nonroot(MemPage *pParent, int iParentIdx, u8 *aSpace2){
BtShared *pBt; /* The whole database */
int nCell = 0; /* Number of cells in apCell[] */
int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
int nOld = 0; /* Number of pages in apOld[] */
int nNew = 0; /* Number of pages in apNew[] */
int nDiv; /* Number of cells in apDiv[] */
int i, j, k; /* Loop counters */
int idx; /* Index of pPage in pParent->aCell[] */
int nxDiv; /* Next divider slot in pParent->aCell[] */
int rc; /* The return code */
int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
@ -5381,84 +5373,30 @@ static int balance_nonroot(BtCursor *pCur){
u16 *szCell; /* Local size of all cells in apCell[] */
u8 *aCopy[NB]; /* Space for holding data of apCopy[] */
u8 *aSpace1; /* Space for copies of dividers cells before balance */
u8 *aSpace2 = 0; /* Space for overflow dividers cells after balance */
u8 *aFrom = 0;
pPage = pCur->apPage[pCur->iPage];
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
VVA_ONLY( pCur->pagesShuffled = 1 );
/*
** Find the parent page.
*/
assert( pCur->iPage>0 );
assert( pPage->isInit );
assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
pBt = pPage->pBt;
pParent = pCur->apPage[pCur->iPage-1];
assert( pParent );
if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
goto balance_cleanup;
}
pBt = pParent->pBt;
assert( sqlite3_mutex_held(pBt->mutex) );
assert( sqlite3PagerIswriteable(pParent->pDbPage) );
TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
#ifndef SQLITE_OMIT_QUICKBALANCE
/*
** A special case: If a new entry has just been inserted into a
** table (that is, a btree with integer keys and all data at the leaves)
** and the new entry is the right-most entry in the tree (it has the
** largest key) then use the special balance_quick() routine for
** balancing. balance_quick() is much faster and results in a tighter
** packing of data in the common case.
*/
if( pPage->leaf &&
pPage->intKey &&
pPage->nOverflow==1 &&
pPage->aOvfl[0].idx==pPage->nCell &&
pParent->pgno!=1 &&
get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
){
assert( pPage->intKey );
/*
** TODO: Check the siblings to the left of pPage. It may be that
** they are not full and no new page is required.
*/
return balance_quick(pCur);
}
#endif
if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
goto balance_cleanup;
}
/*
** Find the cell in the parent page whose left child points back
** to pPage. The "idx" variable is the index of that cell. If pPage
** is the rightmost child of pParent then set idx to pParent->nCell
*/
idx = pCur->aiIdx[pCur->iPage-1];
assertParentIndex(pParent, idx, pPage->pgno);
/*
** Find sibling pages to pPage and the cells in pParent that divide
** the siblings. An attempt is made to find NN siblings on either
** side of pPage. More siblings are taken from one side, however, if
** pPage there are fewer than NN siblings on the other side. If pParent
/* Find the sibling pages to balance. Also locate the cells in pParent
** that divide the siblings. An attempt is made to find NN siblings on
** either side of pPage. More siblings are taken from one side, however,
** if there are fewer than NN siblings on the other side. If pParent
** has NB or fewer children then all children of pParent are taken.
*/
nxDiv = idx - NN;
nxDiv = iParentIdx - NN;
if( nxDiv + NB > pParent->nCell ){
nxDiv = pParent->nCell - NB + 1;
}
if( nxDiv<0 ){
nxDiv = 0;
}
nDiv = 0;
for(i=0, k=nxDiv; i<NB; i++, k++){
if( k<pParent->nCell ){
apDiv[i] = findCell(pParent, k);
nDiv++;
assert( !pParent->leaf );
pgnoOld[i] = get4byte(apDiv[i]);
}else if( k==pParent->nCell ){
@ -5468,7 +5406,6 @@ static int balance_nonroot(BtCursor *pCur){
}
rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
if( rc ) goto balance_cleanup;
/* apOld[i]->idxParent = k; */
apCopy[i] = 0;
assert( i==nOld );
nOld++;
@ -5505,7 +5442,7 @@ static int balance_nonroot(BtCursor *pCur){
if( ISAUTOVACUUM ){
aFrom = &aSpace1[pBt->pageSize];
}
aSpace2 = sqlite3PageMalloc(pBt->pageSize);
/* aSpace2 = sqlite3PageMalloc(pBt->pageSize); */
if( aSpace2==0 ){
rc = SQLITE_NOMEM;
goto balance_cleanup;
@ -5541,8 +5478,8 @@ static int balance_nonroot(BtCursor *pCur){
** leafData: 1 if pPage holds key+data and pParent holds only keys.
*/
nCell = 0;
leafCorrection = pPage->leaf*4;
leafData = pPage->hasData;
leafCorrection = apOld[0]->leaf*4;
leafData = apOld[0]->hasData;
for(i=0; i<nOld; i++){
MemPage *pOld = apCopy[i];
int limit = pOld->nCell+pOld->nOverflow;
@ -5677,8 +5614,8 @@ static int balance_nonroot(BtCursor *pCur){
/*
** Allocate k new pages. Reuse old pages where possible.
*/
assert( pPage->pgno>1 );
pageFlags = pPage->aData[0];
assert( apOld[0]->pgno>1 );
pageFlags = apOld[0]->aData[0];
for(i=0; i<k; i++){
MemPage *pNew;
if( i<nOld ){
@ -5906,16 +5843,11 @@ static int balance_nonroot(BtCursor *pCur){
apCell = 0;
TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
pPage->pgno, nOld, nNew, nCell));
pPage->nOverflow = 0;
releasePage(pPage);
pCur->iPage--;
rc = balance(pCur, 0);
/*
** Cleanup before returning.
*/
balance_cleanup:
sqlite3PageFree(aSpace2);
sqlite3ScratchFree(apCell);
for(i=0; i<nOld; i++){
releasePage(apOld[i]);
@ -5923,216 +5855,315 @@ balance_cleanup:
for(i=0; i<nNew; i++){
releasePage(apNew[i]);
}
pCur->apPage[pCur->iPage]->nOverflow = 0;
return rc;
}
/*
** This routine is called for the root page of a btree when the root
** page contains no cells. This is an opportunity to make the tree
** This function is used to copy the contents of the b-tree node stored
** on page pFrom to page pTo. If page pFrom was not a leaf page, then
** the pointer-map entries for each child page are updated so that the
** parent page stored in the pointer map is page pTo. If pFrom contained
** any cells with overflow page pointers, then the corresponding pointer
** map entries are also updated so that the parent page is page pTo.
**
** If pFrom is currently carrying any overflow cells (entries in the
** MemPage.aOvfl[] array), they are not copied to pTo.
**
** Before returning, page pTo is reinitialized using sqlite3BtreeInitPage().
**
** The performance of this function is not critical. It is only used by
** the balance_shallower() and balance_deeper() procedures, neither of
** which are called often under normal circumstances.
*/
static int copyNodeContent(MemPage *pFrom, MemPage *pTo){
BtShared * const pBt = pFrom->pBt;
u8 * const aFrom = pFrom->aData;
u8 * const aTo = pTo->aData;
int const iFromHdr = pFrom->hdrOffset;
int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
int rc = SQLITE_OK;
int iData;
assert( pFrom->isInit );
assert( pFrom->nFree>=iToHdr );
assert( get2byte(&aFrom[iFromHdr+5])<=pBt->usableSize );
/* Copy the b-tree node content from page pFrom to page pTo. */
iData = get2byte(&aFrom[iFromHdr+5]);
memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
/* Reinitialize page pTo so that the contents of the MemPage structure
** match the new data. The initialization of pTo "cannot" fail, as the
** data copied from pFrom is known to be valid. */
pTo->isInit = 0;
TESTONLY(rc = ) sqlite3BtreeInitPage(pTo);
assert( rc==SQLITE_OK );
/* If this is an auto-vacuum database, update the pointer-map entries
** for any b-tree or overflow pages that pTo now contains the pointers to. */
if( ISAUTOVACUUM ){
rc = setChildPtrmaps(pTo);
}
return rc;
}
/*
** This routine is called on the root page of a btree when the root
** page contains no cells. This is an opportunity to make the tree
** shallower by one level.
*/
static int balance_shallower(BtCursor *pCur){
MemPage *pPage; /* Root page of B-Tree */
MemPage *pChild; /* The only child page of pPage */
Pgno pgnoChild; /* Page number for pChild */
int rc = SQLITE_OK; /* Return code from subprocedures */
BtShared *pBt; /* The main BTree structure */
int mxCellPerPage; /* Maximum number of cells per page */
u8 **apCell; /* All cells from pages being balanced */
u16 *szCell; /* Local size of all cells */
static int balance_shallower(MemPage *pRoot){
assert( pCur->iPage==0 );
pPage = pCur->apPage[0];
assert( pPage->nCell==0 );
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
pBt = pPage->pBt;
mxCellPerPage = MX_CELL(pBt);
apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
if( apCell==0 ) return SQLITE_NOMEM;
szCell = (u16*)&apCell[mxCellPerPage];
if( pPage->leaf ){
/* The table is completely empty */
TRACE(("BALANCE: empty table %d\n", pPage->pgno));
}else{
/* The root page is empty but has one child. Transfer the
** information from that one child into the root page if it
** will fit. This reduces the depth of the tree by one.
**
** If the root page is page 1, it has less space available than
** its child (due to the 100 byte header that occurs at the beginning
** of the database fle), so it might not be able to hold all of the
** information currently contained in the child. If this is the
** case, then do not do the transfer. Leave page 1 empty except
** for the right-pointer to the child page. The child page becomes
** the virtual root of the tree.
*/
VVA_ONLY( pCur->pagesShuffled = 1 );
pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
assert( pgnoChild>0 );
assert( pgnoChild<=pagerPagecount(pPage->pBt) );
rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
if( rc ) goto end_shallow_balance;
if( pPage->pgno==1 ){
rc = sqlite3BtreeInitPage(pChild);
if( rc ) goto end_shallow_balance;
assert( pChild->nOverflow==0 );
if( pChild->nFree>=100 ){
/* The child information will fit on the root page, so do the
** copy */
int i;
zeroPage(pPage, pChild->aData[0]);
for(i=0; i<pChild->nCell; i++){
apCell[i] = findCell(pChild,i);
szCell[i] = cellSizePtr(pChild, apCell[i]);
}
assemblePage(pPage, pChild->nCell, apCell, szCell);
/* Copy the right-pointer of the child to the parent. */
assert( sqlite3PagerIswriteable(pPage->pDbPage) );
put4byte(&pPage->aData[pPage->hdrOffset+8],
get4byte(&pChild->aData[pChild->hdrOffset+8]));
/* The root page is empty but has one child. Transfer the
** information from that one child into the root page if it
** will fit. This reduces the depth of the tree by one.
**
** If the root page is page 1, it has less space available than
** its child (due to the 100 byte header that occurs at the beginning
** of the database fle), so it might not be able to hold all of the
** information currently contained in the child. If this is the
** case, then do not do the transfer. Leave page 1 empty except
** for the right-pointer to the child page. The child page becomes
** the virtual root of the tree.
*/
int rc = SQLITE_OK; /* Return code */
int const hdr = pRoot->hdrOffset; /* Offset of root page header */
MemPage *pChild; /* Only child of pRoot */
Pgno const pgnoChild = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
assert( pRoot->nCell==0 );
assert( sqlite3_mutex_held(pRoot->pBt->mutex) );
assert( !pRoot->leaf );
assert( pgnoChild>0 );
assert( pgnoChild<=pagerPagecount(pRoot->pBt) );
assert( hdr==0 || pRoot->pgno==1 );
rc = sqlite3BtreeGetPage(pRoot->pBt, pgnoChild, &pChild, 0);
if( rc==SQLITE_OK ){
if( pChild->nFree>=hdr ){
if( hdr ){
rc = defragmentPage(pChild);
}
if( rc==SQLITE_OK ){
rc = copyNodeContent(pChild, pRoot);
}
if( rc==SQLITE_OK ){
rc = freePage(pChild);
TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
}else{
/* The child has more information that will fit on the root.
** The tree is already balanced. Do nothing. */
TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
}
}else{
memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
pPage->isInit = 0;
rc = sqlite3BtreeInitPage(pPage);
assert( rc==SQLITE_OK );
freePage(pChild);
TRACE(("BALANCE: transfer child %d into root %d\n",
pChild->pgno, pPage->pgno));
/* The child has more information that will fit on the root.
** The tree is already balanced. Do nothing. */
TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
}
assert( pPage->nOverflow==0 );
#ifndef SQLITE_OMIT_AUTOVACUUM
if( ISAUTOVACUUM && rc==SQLITE_OK ){
rc = setChildPtrmaps(pPage);
}
#endif
releasePage(pChild);
}
end_shallow_balance:
sqlite3_free(apCell);
return rc;
}
/*
** The root page is overfull
** This function is called when the root page of a b-tree structure is
** overfull (has one or more overflow pages).
**
** When this happens, Create a new child page and copy the
** contents of the root into the child. Then make the root
** page an empty page with rightChild pointing to the new
** child. Finally, call balance_internal() on the new child
** to cause it to split.
** A new child page is allocated and the contents of the current root
** page, including overflow cells, are copied into the child. The root
** page is then overwritten to make it an empty page with the right-child
** pointer pointing to the new page.
**
** Before returning, all pointer-map entries corresponding to pages
** that the new child-page now contains pointers to are updated. The
** entry corresponding to the new right-child pointer of the root
** page is also updated.
**
** If successful, *ppChild is set to contain a reference to the child
** page and SQLITE_OK is returned. In this case the caller is required
** to call releasePage() on *ppChild exactly once. If an error occurs,
** an error code is returned and *ppChild is set to 0.
*/
static int balance_deeper(BtCursor *pCur){
int rc; /* Return value from subprocedures */
MemPage *pPage; /* Pointer to the root page */
MemPage *pChild; /* Pointer to a new child page */
Pgno pgnoChild; /* Page number of the new child page */
BtShared *pBt; /* The BTree */
int usableSize; /* Total usable size of a page */
u8 *data; /* Content of the parent page */
u8 *cdata; /* Content of the child page */
int hdr; /* Offset to page header in parent */
int cbrk; /* Offset to content of first cell in parent */
static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
int rc; /* Return value from subprocedures */
MemPage *pChild = 0; /* Pointer to a new child page */
Pgno pgnoChild; /* Page number of the new child page */
BtShared *pBt = pRoot->pBt; /* The BTree */
assert( pCur->iPage==0 );
assert( pCur->apPage[0]->nOverflow>0 );
VVA_ONLY( pCur->pagesShuffled = 1 );
pPage = pCur->apPage[0];
pBt = pPage->pBt;
assert( pRoot->nOverflow>0 );
assert( sqlite3_mutex_held(pBt->mutex) );
assert( sqlite3PagerIswriteable(pPage->pDbPage) );
rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
if( rc ) return rc;
assert( sqlite3PagerIswriteable(pChild->pDbPage) );
usableSize = pBt->usableSize;
data = pPage->aData;
hdr = pPage->hdrOffset;
cbrk = get2byte(&data[hdr+5]);
cdata = pChild->aData;
memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk);
assert( pChild->isInit==0 );
rc = sqlite3BtreeInitPage(pChild);
if( rc==SQLITE_OK ){
int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]);
memcpy(pChild->aOvfl, pPage->aOvfl, nCopy);
pChild->nOverflow = pPage->nOverflow;
if( pChild->nOverflow ){
pChild->nFree = 0;
}
assert( pChild->nCell==pPage->nCell );
assert( sqlite3PagerIswriteable(pPage->pDbPage) );
zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
if( ISAUTOVACUUM ){
rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
#ifndef SQLITE_OMIT_AUTOVACUUM
if( rc==SQLITE_OK ){
rc = setChildPtrmaps(pChild);
}
if( rc ){
pChild->nOverflow = 0;
}
#endif
}
}
if( rc==SQLITE_OK ){
pCur->iPage++;
pCur->apPage[1] = pChild;
pCur->aiIdx[0] = 0;
rc = balance_nonroot(pCur);
}else{
/* Make pRoot, the root page of the b-tree, writable. Allocate a new
** page that will become the new right-child of pPage. Copy the contents
** of the node stored on pRoot into the new child page.
*/
if( SQLITE_OK!=(rc = sqlite3PagerWrite(pRoot->pDbPage))
|| SQLITE_OK!=(rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0))
|| SQLITE_OK!=(rc = copyNodeContent(pRoot, pChild))
|| (ISAUTOVACUUM &&
SQLITE_OK!=(rc = ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno)))
){
*ppChild = 0;
releasePage(pChild);
return rc;
}
assert( sqlite3PagerIswriteable(pChild->pDbPage) );
assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
assert( pChild->nCell==pRoot->nCell );
return rc;
TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
/* Copy the overflow cells from pRoot to pChild */
memcpy(pChild->aOvfl, pRoot->aOvfl, pRoot->nOverflow*sizeof(pRoot->aOvfl[0]));
pChild->nOverflow = pRoot->nOverflow;
pChild->nFree = 0;
/* Zero the contents of pRoot. Then install pChild as the right-child. */
zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
*ppChild = pChild;
return SQLITE_OK;
}
/*
** The page that pCur currently points to has just been modified in
** some way. This function figures out if this modification means the
** tree needs to be balanced, and if so calls the appropriate balancing
** routine.
**
** Parameter isInsert is true if a new cell was just inserted into the
** page, or false otherwise.
** routine. Balancing routines are:
**
** balance_quick()
** balance_shallower()
** balance_deeper()
** balance_nonroot()
**
** If built with SQLITE_DEBUG, pCur->pagesShuffled is set to true if
** balance_shallower(), balance_deeper() or balance_nonroot() is called.
** If none of these functions are invoked, pCur->pagesShuffled is left
** unmodified.
*/
static int balance(BtCursor *pCur, int isInsert){
static int balance(BtCursor *pCur){
int rc = SQLITE_OK;
MemPage *pPage = pCur->apPage[pCur->iPage];
const int nMin = pCur->pBt->usableSize * 2 / 3;
u8 aBalanceQuickSpace[13];
u8 *pFree = 0;
assert( sqlite3_mutex_held(pPage->pBt->mutex) );
if( pCur->iPage==0 ){
rc = sqlite3PagerWrite(pPage->pDbPage);
if( rc==SQLITE_OK && pPage->nOverflow>0 ){
rc = balance_deeper(pCur);
assert( pCur->apPage[0]==pPage );
assert( pPage->nOverflow==0 || rc!=SQLITE_OK );
}
if( rc==SQLITE_OK && pPage->nCell==0 ){
rc = balance_shallower(pCur);
assert( pCur->apPage[0]==pPage );
assert( pPage->nOverflow==0 || rc!=SQLITE_OK );
}
}else{
if( pPage->nOverflow>0 ||
(!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
rc = balance_nonroot(pCur);
TESTONLY( int balance_quick_called = 0; );
TESTONLY( int balance_deeper_called = 0; );
do {
int iPage = pCur->iPage;
MemPage *pPage = pCur->apPage[iPage];
if( iPage==0 ){
if( pPage->nOverflow ){
/* The root page of the b-tree is overfull. In this case call the
** balance_deeper() function to create a new child for the root-page
** and copy the current contents of the root-page to it. The
** next iteration of the do-loop will balance the child page.
*/
assert( (balance_deeper_called++)==0 );
rc = balance_deeper(pPage, &pCur->apPage[1]);
if( rc==SQLITE_OK ){
pCur->iPage = 1;
pCur->aiIdx[0] = 0;
pCur->aiIdx[1] = 0;
assert( pCur->apPage[1]->nOverflow );
}
VVA_ONLY( pCur->pagesShuffled = 1 );
}else{
/* The root page of the b-tree is now empty. If the root-page is not
** also a leaf page, it will have a single child page. Call
** balance_shallower to attempt to copy the contents of the single
** child-page into the root page (this may not be possible if the
** root page is page 1).
**
** Whether or not this is possible , the tree is now balanced.
** Therefore is no next iteration of the do-loop.
*/
if( pPage->nCell==0 && !pPage->leaf ){
rc = balance_shallower(pPage);
VVA_ONLY( pCur->pagesShuffled = 1 );
}
break;
}
}else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
break;
}else{
MemPage * const pParent = pCur->apPage[iPage-1];
int const iIdx = pCur->aiIdx[iPage-1];
rc = sqlite3PagerWrite(pParent->pDbPage);
if( rc==SQLITE_OK ){
#ifndef SQLITE_OMIT_QUICKBALANCE
if( pPage->hasData
&& pPage->nOverflow==1
&& pPage->aOvfl[0].idx==pPage->nCell
&& pParent->pgno!=1
&& pParent->nCell==iIdx
){
/* Call balance_quick() to create a new sibling of pPage on which
** to store the overflow cell. balance_quick() inserts a new cell
** into pParent, which may cause pParent overflow. If this
** happens, the next interation of the do-loop will balance pParent
** use either balance_nonroot() or balance_deeper(). Until this
** happens, the overflow cell is stored in the aBalanceQuickSpace[]
** buffer.
**
** The purpose of the following assert() is to check that only a
** single call to balance_quick() is made for each call to this
** function. If this were not verified, a subtle bug involving reuse
** of the aBalanceQuickSpace[] might sneak in.
*/
assert( (balance_quick_called++)==0 );
rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
}else
#endif
{
/* In this case, call balance_nonroot() to redistribute cells
** between pPage and up to 2 of its sibling pages. This involves
** modifying the contents of pParent, which may cause pParent to
** become overfull or underfull. The next iteration of the do-loop
** will balance the parent page to correct this.
**
** If the parent page becomes overfull, the overflow cell or cells
** are stored in the pSpace buffer allocated immediately below.
** A subsequent iteration of the do-loop will deal with this by
** calling balance_nonroot() (balance_deeper() may be called first,
** but it doesn't deal with overflow cells - just moves them to a
** different page). Once this subsequent call to balance_nonroot()
** has completed, it is safe to release the pSpace buffer used by
** the previous call, as the overflow cell data will have been
** copied either into the body of a database page or into the new
** pSpace buffer passed to the latter call to balance_nonroot().
*/
u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
rc = balance_nonroot(pParent, iIdx, pSpace);
if( pFree ){
/* If pFree is not NULL, it points to the pSpace buffer used
** by a previous call to balance_nonroot(). Its contents are
** now stored either on real database pages or within the
** new pSpace buffer, so it may be safely freed here. */
sqlite3PageFree(pFree);
}
/* The pSpace buffer will be freed after the next call to
** balance_nonroot(), or just before this function returns, whichever
** comes first. */
pFree = pSpace;
VVA_ONLY( pCur->pagesShuffled = 1 );
}
}
pPage->nOverflow = 0;
/* The next iteration of the do-loop balances the parent page. */
releasePage(pPage);
pCur->iPage--;
}
}while( rc==SQLITE_OK );
if( pFree ){
sqlite3PageFree(pFree);
}
return rc;
}
@ -6279,6 +6310,7 @@ int sqlite3BtreeInsert(
pPage = pCur->apPage[pCur->iPage];
assert( pPage->intKey || nKey>=0 );
assert( pPage->intKey || nKey>=0 );
assert( pPage->leaf || !pPage->intKey );
TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
pCur->pgnoRoot, nKey, nData, pPage->pgno,
@ -6313,47 +6345,42 @@ int sqlite3BtreeInsert(
}else if( loc<0 && pPage->nCell>0 ){
assert( pPage->leaf );
idx = ++pCur->aiIdx[pCur->iPage];
pCur->info.nSize = 0;
pCur->validNKey = 0;
}else{
assert( pPage->leaf );
}
rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
/* If no error has occured, call balance() to deal with any overflow and
** move the cursor to point at the root of the table (since balance may
** have rearranged the table in such a way as to invalidate BtCursor.apPage[]
** or BtCursor.aiIdx[]).
/* If no error has occured and pPage has an overflow cell, call balance()
** to redistribute the cells within the tree. Since balance() may move
** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey
** variables.
**
** Except, if all of the following are true, do nothing:
** Previous versions of SQLite called moveToRoot() to move the cursor
** back to the root page as balance() used to invalidate the contents
** of BtCursor.apPage[] and BtCursor.aiIdx[]. This is no longer necessary,
** as balance() always leaves the cursor pointing to a valid entry.
**
** * Inserting the new cell did not cause overflow,
**
** * Before inserting the new cell the cursor was pointing at the
** largest key in an intkey B-Tree, and
**
** * The key value associated with the new cell is now the largest
** in the B-Tree.
**
** In this case the cursor can be safely left pointing at the (new)
** largest key value in the B-Tree. Doing so speeds up inserting a set
** of entries with increasing integer key values via a single cursor
** (comes up with "INSERT INTO ... SELECT ..." statements), as
** the next insert operation is not required to seek the cursor.
** There is a subtle but important optimization here too. When inserting
** multiple records into an intkey b-tree using a single cursor (as can
** happen while processing an "INSERT INTO ... SELECT" statement), it
** is advantageous to leave the cursor pointing to the last entry in
** the b-tree if possible. If the cursor is left pointing to the last
** entry in the table, and the next row inserted has an integer key
** larger than the largest existing key, it is possible to insert the
** row without seeking the cursor. This can be a big performance boost.
*/
if( rc==SQLITE_OK
&& (pPage->nOverflow || !pCur->atLast || loc>=0 || !pCur->apPage[0]->intKey)
){
rc = balance(pCur, 1);
if( rc==SQLITE_OK ){
moveToRoot(pCur);
}
pCur->info.nSize = 0;
pCur->validNKey = 0;
if( rc==SQLITE_OK && pPage->nOverflow ){
pCur->atLast = 0;
rc = balance(pCur);
/* Must make sure nOverflow is reset to zero even if the balance()
** fails. Internal data structure corruption will result otherwise. */
pCur->apPage[pCur->iPage]->nOverflow = 0;
}
/* Must make sure nOverflow is reset to zero even if the balance()
** fails. Internal data structure corruption will result otherwise. */
pCur->apPage[pCur->iPage]->nOverflow = 0;
assert( pCur->apPage[pCur->iPage]->nOverflow==0 );
end_insert:
return rc;
@ -6500,7 +6527,7 @@ int sqlite3BtreeDelete(BtCursor *pCur){
assert( sqlite3PagerIswriteable(pPage->pDbPage) );
put4byte(findOverflowCell(pPage, idx), pgnoChild);
VVA_ONLY( pCur->pagesShuffled = 0 );
rc = balance(pCur, 0);
rc = balance(pCur);
}
if( rc==SQLITE_OK && leafCursorInvalid ){
@ -6542,7 +6569,7 @@ int sqlite3BtreeDelete(BtCursor *pCur){
){
dropCell(pLeafPage, 0, szNext);
VVA_ONLY( leafCur.pagesShuffled = 0 );
rc = balance(&leafCur, 0);
rc = balance(&leafCur);
assert( leafCursorInvalid || !leafCur.pagesShuffled
|| !pCur->pagesShuffled );
}
@ -6553,7 +6580,7 @@ int sqlite3BtreeDelete(BtCursor *pCur){
pCur->pgnoRoot, pPage->pgno));
rc = dropCell(pPage, idx, cellSizePtr(pPage, pCell));
if( rc==SQLITE_OK ){
rc = balance(pCur, 0);
rc = balance(pCur);
}
}
if( rc==SQLITE_OK ){