ae2e7fcc4c
FossilOrigin-Name: d71154265a294a1ece89d257f55f6855db7c30aec55ea0dc4eeb61bce1e8fad3
998 lines
33 KiB
C
998 lines
33 KiB
C
/*
|
|
** 2011-08-18
|
|
**
|
|
** The author disclaims copyright to this source code. In place of
|
|
** a legal notice, here is a blessing:
|
|
**
|
|
** May you do good and not evil.
|
|
** May you find forgiveness for yourself and forgive others.
|
|
** May you share freely, never taking more than you give.
|
|
**
|
|
*************************************************************************
|
|
** Internal structure definitions for the LSM module.
|
|
*/
|
|
#ifndef _LSM_INT_H
|
|
#define _LSM_INT_H
|
|
|
|
#include "lsm.h"
|
|
#include <assert.h>
|
|
#include <string.h>
|
|
|
|
#include <stdarg.h>
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <ctype.h>
|
|
|
|
#ifdef _WIN32
|
|
# ifdef _MSC_VER
|
|
# define snprintf _snprintf
|
|
# endif
|
|
#else
|
|
# include <unistd.h>
|
|
#endif
|
|
|
|
#ifdef NDEBUG
|
|
# ifdef LSM_DEBUG_EXPENSIVE
|
|
# undef LSM_DEBUG_EXPENSIVE
|
|
# endif
|
|
# ifdef LSM_DEBUG
|
|
# undef LSM_DEBUG
|
|
# endif
|
|
#else
|
|
# ifndef LSM_DEBUG
|
|
# define LSM_DEBUG
|
|
# endif
|
|
#endif
|
|
|
|
/* #define LSM_DEBUG_EXPENSIVE 1 */
|
|
|
|
/*
|
|
** Default values for various data structure parameters. These may be
|
|
** overridden by calls to lsm_config().
|
|
*/
|
|
#define LSM_DFLT_PAGE_SIZE (4 * 1024)
|
|
#define LSM_DFLT_BLOCK_SIZE (1 * 1024 * 1024)
|
|
#define LSM_DFLT_AUTOFLUSH (1 * 1024 * 1024)
|
|
#define LSM_DFLT_AUTOCHECKPOINT (i64)(2 * 1024 * 1024)
|
|
#define LSM_DFLT_AUTOWORK 1
|
|
#define LSM_DFLT_LOG_SIZE (128*1024)
|
|
#define LSM_DFLT_AUTOMERGE 4
|
|
#define LSM_DFLT_SAFETY LSM_SAFETY_NORMAL
|
|
#define LSM_DFLT_MMAP (LSM_IS_64_BIT ? 1 : 32768)
|
|
#define LSM_DFLT_MULTIPLE_PROCESSES 1
|
|
#define LSM_DFLT_USE_LOG 1
|
|
|
|
/* Initial values for log file checksums. These are only used if the
|
|
** database file does not contain a valid checkpoint. */
|
|
#define LSM_CKSUM0_INIT 42
|
|
#define LSM_CKSUM1_INIT 42
|
|
|
|
/* "mmap" mode is currently only used in environments with 64-bit address
|
|
** spaces. The following macro is used to test for this. */
|
|
#define LSM_IS_64_BIT (sizeof(void*)==8)
|
|
|
|
#define LSM_AUTOWORK_QUANT 32
|
|
|
|
typedef struct Database Database;
|
|
typedef struct DbLog DbLog;
|
|
typedef struct FileSystem FileSystem;
|
|
typedef struct Freelist Freelist;
|
|
typedef struct FreelistEntry FreelistEntry;
|
|
typedef struct Level Level;
|
|
typedef struct LogMark LogMark;
|
|
typedef struct LogRegion LogRegion;
|
|
typedef struct LogWriter LogWriter;
|
|
typedef struct LsmString LsmString;
|
|
typedef struct Mempool Mempool;
|
|
typedef struct Merge Merge;
|
|
typedef struct MergeInput MergeInput;
|
|
typedef struct MetaPage MetaPage;
|
|
typedef struct MultiCursor MultiCursor;
|
|
typedef struct Page Page;
|
|
typedef struct Redirect Redirect;
|
|
typedef struct Segment Segment;
|
|
typedef struct SegmentMerger SegmentMerger;
|
|
typedef struct ShmChunk ShmChunk;
|
|
typedef struct ShmHeader ShmHeader;
|
|
typedef struct ShmReader ShmReader;
|
|
typedef struct Snapshot Snapshot;
|
|
typedef struct TransMark TransMark;
|
|
typedef struct Tree Tree;
|
|
typedef struct TreeCursor TreeCursor;
|
|
typedef struct TreeHeader TreeHeader;
|
|
typedef struct TreeMark TreeMark;
|
|
typedef struct TreeRoot TreeRoot;
|
|
|
|
#ifndef _SQLITEINT_H_
|
|
typedef unsigned char u8;
|
|
typedef unsigned short int u16;
|
|
typedef unsigned int u32;
|
|
typedef lsm_i64 i64;
|
|
typedef unsigned long long int u64;
|
|
#endif
|
|
|
|
/* A page number is a 64-bit integer. */
|
|
typedef i64 LsmPgno;
|
|
|
|
#ifdef LSM_DEBUG
|
|
int lsmErrorBkpt(int);
|
|
#else
|
|
# define lsmErrorBkpt(x) (x)
|
|
#endif
|
|
|
|
#define LSM_PROTOCOL_BKPT lsmErrorBkpt(LSM_PROTOCOL)
|
|
#define LSM_IOERR_BKPT lsmErrorBkpt(LSM_IOERR)
|
|
#define LSM_NOMEM_BKPT lsmErrorBkpt(LSM_NOMEM)
|
|
#define LSM_CORRUPT_BKPT lsmErrorBkpt(LSM_CORRUPT)
|
|
#define LSM_MISUSE_BKPT lsmErrorBkpt(LSM_MISUSE)
|
|
|
|
#define unused_parameter(x) (void)(x)
|
|
#define array_size(x) (sizeof(x)/sizeof(x[0]))
|
|
|
|
|
|
/* The size of each shared-memory chunk */
|
|
#define LSM_SHM_CHUNK_SIZE (32*1024)
|
|
|
|
/* The number of bytes reserved at the start of each shm chunk for MM. */
|
|
#define LSM_SHM_CHUNK_HDR (sizeof(ShmChunk))
|
|
|
|
/* The number of available read locks. */
|
|
#define LSM_LOCK_NREADER 6
|
|
|
|
/* The number of available read-write client locks. */
|
|
#define LSM_LOCK_NRWCLIENT 16
|
|
|
|
/* Lock definitions.
|
|
*/
|
|
#define LSM_LOCK_DMS1 1 /* Serialize connect/disconnect ops */
|
|
#define LSM_LOCK_DMS2 2 /* Read-write connections */
|
|
#define LSM_LOCK_DMS3 3 /* Read-only connections */
|
|
#define LSM_LOCK_WRITER 4
|
|
#define LSM_LOCK_WORKER 5
|
|
#define LSM_LOCK_CHECKPOINTER 6
|
|
#define LSM_LOCK_ROTRANS 7
|
|
#define LSM_LOCK_READER(i) ((i) + LSM_LOCK_ROTRANS + 1)
|
|
#define LSM_LOCK_RWCLIENT(i) ((i) + LSM_LOCK_READER(LSM_LOCK_NREADER))
|
|
|
|
#define LSM_N_LOCK LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT)
|
|
|
|
/*
|
|
** Meta-page size and usable size.
|
|
*/
|
|
#define LSM_META_PAGE_SIZE 4096
|
|
|
|
#define LSM_META_RW_PAGE_SIZE (LSM_META_PAGE_SIZE - LSM_N_LOCK)
|
|
|
|
/*
|
|
** Hard limit on the number of free-list entries that may be stored in
|
|
** a checkpoint (the remainder are stored as a system record in the LSM).
|
|
** See also LSM_CONFIG_MAX_FREELIST.
|
|
*/
|
|
#define LSM_MAX_FREELIST_ENTRIES 24
|
|
|
|
#define LSM_MAX_BLOCK_REDIRECTS 16
|
|
|
|
#define LSM_ATTEMPTS_BEFORE_PROTOCOL 10000
|
|
|
|
|
|
/*
|
|
** Each entry stored in the LSM (or in-memory tree structure) has an
|
|
** associated mask of the following flags.
|
|
*/
|
|
#define LSM_START_DELETE 0x01 /* Start of open-ended delete range */
|
|
#define LSM_END_DELETE 0x02 /* End of open-ended delete range */
|
|
#define LSM_POINT_DELETE 0x04 /* Delete this key */
|
|
#define LSM_INSERT 0x08 /* Insert this key and value */
|
|
#define LSM_SEPARATOR 0x10 /* True if entry is separator key only */
|
|
#define LSM_SYSTEMKEY 0x20 /* True if entry is a system key (FREELIST) */
|
|
|
|
#define LSM_CONTIGUOUS 0x40 /* Used in lsm_tree.c */
|
|
|
|
/*
|
|
** A string that can grow by appending.
|
|
*/
|
|
struct LsmString {
|
|
lsm_env *pEnv; /* Run-time environment */
|
|
int n; /* Size of string. -1 indicates error */
|
|
int nAlloc; /* Space allocated for z[] */
|
|
char *z; /* The string content */
|
|
};
|
|
|
|
typedef struct LsmFile LsmFile;
|
|
struct LsmFile {
|
|
lsm_file *pFile;
|
|
LsmFile *pNext;
|
|
};
|
|
|
|
/*
|
|
** An instance of the following type is used to store an ordered list of
|
|
** u32 values.
|
|
**
|
|
** Note: This is a place-holder implementation. It should be replaced by
|
|
** a version that avoids making a single large allocation when the array
|
|
** contains a large number of values. For this reason, the internals of
|
|
** this object should only manipulated by the intArrayXXX() functions in
|
|
** lsm_tree.c.
|
|
*/
|
|
typedef struct IntArray IntArray;
|
|
struct IntArray {
|
|
int nAlloc;
|
|
int nArray;
|
|
u32 *aArray;
|
|
};
|
|
|
|
struct Redirect {
|
|
int n; /* Number of redirects */
|
|
struct RedirectEntry {
|
|
int iFrom;
|
|
int iTo;
|
|
} *a;
|
|
};
|
|
|
|
/*
|
|
** An instance of this structure represents a point in the history of the
|
|
** tree structure to roll back to. Refer to comments in lsm_tree.c for
|
|
** details.
|
|
*/
|
|
struct TreeMark {
|
|
u32 iRoot; /* Offset of root node in shm file */
|
|
u32 nHeight; /* Current height of tree structure */
|
|
u32 iWrite; /* Write offset in shm file */
|
|
u32 nChunk; /* Number of chunks in shared-memory file */
|
|
u32 iFirst; /* First chunk in linked list */
|
|
u32 iNextShmid; /* Next id to allocate */
|
|
int iRollback; /* Index in lsm->rollback to revert to */
|
|
};
|
|
|
|
/*
|
|
** An instance of this structure represents a point in the database log.
|
|
*/
|
|
struct LogMark {
|
|
i64 iOff; /* Offset into log (see lsm_log.c) */
|
|
int nBuf; /* Size of in-memory buffer here */
|
|
u8 aBuf[8]; /* Bytes of content in aBuf[] */
|
|
u32 cksum0; /* Checksum 0 at offset (iOff-nBuf) */
|
|
u32 cksum1; /* Checksum 1 at offset (iOff-nBuf) */
|
|
};
|
|
|
|
struct TransMark {
|
|
TreeMark tree;
|
|
LogMark log;
|
|
};
|
|
|
|
/*
|
|
** A structure that defines the start and end offsets of a region in the
|
|
** log file. The size of the region in bytes is (iEnd - iStart), so if
|
|
** iEnd==iStart the region is zero bytes in size.
|
|
*/
|
|
struct LogRegion {
|
|
i64 iStart; /* Start of region in log file */
|
|
i64 iEnd; /* End of region in log file */
|
|
};
|
|
|
|
struct DbLog {
|
|
u32 cksum0; /* Checksum 0 at offset iOff */
|
|
u32 cksum1; /* Checksum 1 at offset iOff */
|
|
i64 iSnapshotId; /* Log space has been reclaimed to this ss */
|
|
LogRegion aRegion[3]; /* Log file regions (see docs in lsm_log.c) */
|
|
};
|
|
|
|
struct TreeRoot {
|
|
u32 iRoot;
|
|
u32 nHeight;
|
|
u32 nByte; /* Total size of this tree in bytes */
|
|
u32 iTransId;
|
|
};
|
|
|
|
/*
|
|
** Tree header structure.
|
|
*/
|
|
struct TreeHeader {
|
|
u32 iUsedShmid; /* Id of first shm chunk used by this tree */
|
|
u32 iNextShmid; /* Shm-id of next chunk allocated */
|
|
u32 iFirst; /* Chunk number of smallest shm-id */
|
|
u32 nChunk; /* Number of chunks in shared-memory file */
|
|
TreeRoot root; /* Root and height of current tree */
|
|
u32 iWrite; /* Write offset in shm file */
|
|
TreeRoot oldroot; /* Root and height of the previous tree */
|
|
u32 iOldShmid; /* Last shm-id used by previous tree */
|
|
u32 iUsrVersion; /* get/set_user_version() value */
|
|
i64 iOldLog; /* Log offset associated with old tree */
|
|
u32 oldcksum0;
|
|
u32 oldcksum1;
|
|
DbLog log; /* Current layout of log file */
|
|
u32 aCksum[2]; /* Checksums 1 and 2. */
|
|
};
|
|
|
|
/*
|
|
** Database handle structure.
|
|
**
|
|
** mLock:
|
|
** A bitmask representing the locks currently held by the connection.
|
|
** An LSM database supports N distinct locks, where N is some number less
|
|
** than or equal to 32. Locks are numbered starting from 1 (see the
|
|
** definitions for LSM_LOCK_WRITER and co.).
|
|
**
|
|
** The least significant 32-bits in mLock represent EXCLUSIVE locks. The
|
|
** most significant are SHARED locks. So, if a connection holds a SHARED
|
|
** lock on lock region iLock, then the following is true:
|
|
**
|
|
** (mLock & ((iLock+32-1) << 1))
|
|
**
|
|
** Or for an EXCLUSIVE lock:
|
|
**
|
|
** (mLock & ((iLock-1) << 1))
|
|
**
|
|
** pCsr:
|
|
** Points to the head of a linked list that contains all currently open
|
|
** cursors. Once this list becomes empty, the user has no outstanding
|
|
** cursors and the database handle can be successfully closed.
|
|
**
|
|
** pCsrCache:
|
|
** This list contains cursor objects that have been closed using
|
|
** lsm_csr_close(). Each time a cursor is closed, it is shifted from
|
|
** the pCsr list to this list. When a new cursor is opened, this list
|
|
** is inspected to see if there exists a cursor object that can be
|
|
** reused. This is an optimization only.
|
|
*/
|
|
struct lsm_db {
|
|
|
|
/* Database handle configuration */
|
|
lsm_env *pEnv; /* runtime environment */
|
|
int (*xCmp)(void *, int, void *, int); /* Compare function */
|
|
|
|
/* Values configured by calls to lsm_config */
|
|
int eSafety; /* LSM_SAFETY_OFF, NORMAL or FULL */
|
|
int bAutowork; /* Configured by LSM_CONFIG_AUTOWORK */
|
|
int nTreeLimit; /* Configured by LSM_CONFIG_AUTOFLUSH */
|
|
int nMerge; /* Configured by LSM_CONFIG_AUTOMERGE */
|
|
int bUseLog; /* Configured by LSM_CONFIG_USE_LOG */
|
|
int nDfltPgsz; /* Configured by LSM_CONFIG_PAGE_SIZE */
|
|
int nDfltBlksz; /* Configured by LSM_CONFIG_BLOCK_SIZE */
|
|
int nMaxFreelist; /* Configured by LSM_CONFIG_MAX_FREELIST */
|
|
int iMmap; /* Configured by LSM_CONFIG_MMAP */
|
|
i64 nAutockpt; /* Configured by LSM_CONFIG_AUTOCHECKPOINT */
|
|
int bMultiProc; /* Configured by L_C_MULTIPLE_PROCESSES */
|
|
int bReadonly; /* Configured by LSM_CONFIG_READONLY */
|
|
lsm_compress compress; /* Compression callbacks */
|
|
lsm_compress_factory factory; /* Compression callback factory */
|
|
|
|
/* Sub-system handles */
|
|
FileSystem *pFS; /* On-disk portion of database */
|
|
Database *pDatabase; /* Database shared data */
|
|
|
|
int iRwclient; /* Read-write client lock held (-1 == none) */
|
|
|
|
/* Client transaction context */
|
|
Snapshot *pClient; /* Client snapshot */
|
|
int iReader; /* Read lock held (-1 == unlocked) */
|
|
int bRoTrans; /* True if a read-only db trans is open */
|
|
MultiCursor *pCsr; /* List of all open cursors */
|
|
LogWriter *pLogWriter; /* Context for writing to the log file */
|
|
int nTransOpen; /* Number of opened write transactions */
|
|
int nTransAlloc; /* Allocated size of aTrans[] array */
|
|
TransMark *aTrans; /* Array of marks for transaction rollback */
|
|
IntArray rollback; /* List of tree-nodes to roll back */
|
|
int bDiscardOld; /* True if lsmTreeDiscardOld() was called */
|
|
|
|
MultiCursor *pCsrCache; /* List of all closed cursors */
|
|
|
|
/* Worker context */
|
|
Snapshot *pWorker; /* Worker snapshot (or NULL) */
|
|
Freelist *pFreelist; /* See sortedNewToplevel() */
|
|
int bUseFreelist; /* True to use pFreelist */
|
|
int bIncrMerge; /* True if currently doing a merge */
|
|
|
|
int bInFactory; /* True if within factory.xFactory() */
|
|
|
|
/* Debugging message callback */
|
|
void (*xLog)(void *, int, const char *);
|
|
void *pLogCtx;
|
|
|
|
/* Work done notification callback */
|
|
void (*xWork)(lsm_db *, void *);
|
|
void *pWorkCtx;
|
|
|
|
u64 mLock; /* Mask of current locks. See lsmShmLock(). */
|
|
lsm_db *pNext; /* Next connection to same database */
|
|
|
|
int nShm; /* Size of apShm[] array */
|
|
void **apShm; /* Shared memory chunks */
|
|
ShmHeader *pShmhdr; /* Live shared-memory header */
|
|
TreeHeader treehdr; /* Local copy of tree-header */
|
|
u32 aSnapshot[LSM_META_PAGE_SIZE / sizeof(u32)];
|
|
};
|
|
|
|
struct Segment {
|
|
LsmPgno iFirst; /* First page of this run */
|
|
LsmPgno iLastPg; /* Last page of this run */
|
|
LsmPgno iRoot; /* Root page number (if any) */
|
|
LsmPgno nSize; /* Size of this run in pages */
|
|
|
|
Redirect *pRedirect; /* Block redirects (or NULL) */
|
|
};
|
|
|
|
/*
|
|
** iSplitTopic/pSplitKey/nSplitKey:
|
|
** If nRight>0, this buffer contains a copy of the largest key that has
|
|
** already been written to the left-hand-side of the level.
|
|
*/
|
|
struct Level {
|
|
Segment lhs; /* Left-hand (main) segment */
|
|
int nRight; /* Size of apRight[] array */
|
|
Segment *aRhs; /* Old segments being merged into this */
|
|
int iSplitTopic; /* Split key topic (if nRight>0) */
|
|
void *pSplitKey; /* Pointer to split-key (if nRight>0) */
|
|
int nSplitKey; /* Number of bytes in split-key */
|
|
|
|
u16 iAge; /* Number of times data has been written */
|
|
u16 flags; /* Mask of LEVEL_XXX bits */
|
|
Merge *pMerge; /* Merge operation currently underway */
|
|
Level *pNext; /* Next level in tree */
|
|
};
|
|
|
|
/*
|
|
** The Level.flags field is set to a combination of the following bits.
|
|
**
|
|
** LEVEL_FREELIST_ONLY:
|
|
** Set if the level consists entirely of free-list entries.
|
|
**
|
|
** LEVEL_INCOMPLETE:
|
|
** This is set while a new toplevel level is being constructed. It is
|
|
** never set for any level other than a new toplevel.
|
|
*/
|
|
#define LEVEL_FREELIST_ONLY 0x0001
|
|
#define LEVEL_INCOMPLETE 0x0002
|
|
|
|
|
|
/*
|
|
** A structure describing an ongoing merge. There is an instance of this
|
|
** structure for every Level currently undergoing a merge in the worker
|
|
** snapshot.
|
|
**
|
|
** It is assumed that code that uses an instance of this structure has
|
|
** access to the associated Level struct.
|
|
**
|
|
** iOutputOff:
|
|
** The byte offset to write to next within the last page of the
|
|
** output segment.
|
|
*/
|
|
struct MergeInput {
|
|
LsmPgno iPg; /* Page on which next input is stored */
|
|
int iCell; /* Cell containing next input to merge */
|
|
};
|
|
struct Merge {
|
|
int nInput; /* Number of input runs being merged */
|
|
MergeInput *aInput; /* Array nInput entries in size */
|
|
MergeInput splitkey; /* Location in file of current splitkey */
|
|
int nSkip; /* Number of separators entries to skip */
|
|
int iOutputOff; /* Write offset on output page */
|
|
LsmPgno iCurrentPtr; /* Current pointer value */
|
|
};
|
|
|
|
/*
|
|
** The first argument to this macro is a pointer to a Segment structure.
|
|
** Returns true if the structure instance indicates that the separators
|
|
** array is valid.
|
|
*/
|
|
#define segmentHasSeparators(pSegment) ((pSegment)->sep.iFirst>0)
|
|
|
|
/*
|
|
** The values that accompany the lock held by a database reader.
|
|
*/
|
|
struct ShmReader {
|
|
u32 iTreeId;
|
|
i64 iLsmId;
|
|
};
|
|
|
|
/*
|
|
** An instance of this structure is stored in the first shared-memory
|
|
** page. The shared-memory header.
|
|
**
|
|
** bWriter:
|
|
** Immediately after opening a write transaction taking the WRITER lock,
|
|
** each writer client sets this flag. It is cleared right before the
|
|
** WRITER lock is relinquished. If a subsequent writer finds that this
|
|
** flag is already set when a write transaction is opened, this indicates
|
|
** that a previous writer failed mid-transaction.
|
|
**
|
|
** iMetaPage:
|
|
** If the database file does not contain a valid, synced, checkpoint, this
|
|
** value is set to 0. Otherwise, it is set to the meta-page number that
|
|
** contains the most recently written checkpoint (either 1 or 2).
|
|
**
|
|
** hdr1, hdr2:
|
|
** The two copies of the in-memory tree header. Two copies are required
|
|
** in case a writer fails while updating one of them.
|
|
*/
|
|
struct ShmHeader {
|
|
u32 aSnap1[LSM_META_PAGE_SIZE / 4];
|
|
u32 aSnap2[LSM_META_PAGE_SIZE / 4];
|
|
u32 bWriter;
|
|
u32 iMetaPage;
|
|
TreeHeader hdr1;
|
|
TreeHeader hdr2;
|
|
ShmReader aReader[LSM_LOCK_NREADER];
|
|
};
|
|
|
|
/*
|
|
** An instance of this structure is stored at the start of each shared-memory
|
|
** chunk except the first (which is the header chunk - see above).
|
|
*/
|
|
struct ShmChunk {
|
|
u32 iShmid;
|
|
u32 iNext;
|
|
};
|
|
|
|
/*
|
|
** Maximum number of shared-memory chunks allowed in the *-shm file. Since
|
|
** each shared-memory chunk is 32KB in size, this is a theoretical limit only.
|
|
*/
|
|
#define LSM_MAX_SHMCHUNKS (1<<30)
|
|
|
|
/* Return true if shm-sequence "a" is larger than or equal to "b" */
|
|
#define shm_sequence_ge(a, b) (((u32)a-(u32)b) < LSM_MAX_SHMCHUNKS)
|
|
|
|
#define LSM_APPLIST_SZ 4
|
|
|
|
/*
|
|
** An instance of the following structure stores the in-memory part of
|
|
** the current free block list. This structure is to the free block list
|
|
** as the in-memory tree is to the users database content. The contents
|
|
** of the free block list is found by merging the in-memory components
|
|
** with those stored in the LSM, just as the contents of the database is
|
|
** found by merging the in-memory tree with the user data entries in the
|
|
** LSM.
|
|
**
|
|
** Each FreelistEntry structure in the array represents either an insert
|
|
** or delete operation on the free-list. For deletes, the FreelistEntry.iId
|
|
** field is set to -1. For inserts, it is set to zero or greater.
|
|
**
|
|
** The array of FreelistEntry structures is always sorted in order of
|
|
** block number (ascending).
|
|
**
|
|
** When the in-memory free block list is written into the LSM, each insert
|
|
** operation is written separately. The entry key is the bitwise inverse
|
|
** of the block number as a 32-bit big-endian integer. This is done so that
|
|
** the entries in the LSM are sorted in descending order of block id.
|
|
** The associated value is the snapshot id, formated as a varint.
|
|
*/
|
|
struct Freelist {
|
|
FreelistEntry *aEntry; /* Free list entries */
|
|
int nEntry; /* Number of valid slots in aEntry[] */
|
|
int nAlloc; /* Allocated size of aEntry[] */
|
|
};
|
|
struct FreelistEntry {
|
|
u32 iBlk; /* Block number */
|
|
i64 iId; /* Largest snapshot id to use this block */
|
|
};
|
|
|
|
/*
|
|
** A snapshot of a database. A snapshot contains all the information required
|
|
** to read or write a database file on disk. See the description of struct
|
|
** Database below for futher details.
|
|
*/
|
|
struct Snapshot {
|
|
Database *pDatabase; /* Database this snapshot belongs to */
|
|
u32 iCmpId; /* Id of compression scheme */
|
|
Level *pLevel; /* Pointer to level 0 of snapshot (or NULL) */
|
|
i64 iId; /* Snapshot id */
|
|
i64 iLogOff; /* Log file offset */
|
|
Redirect redirect; /* Block redirection array */
|
|
|
|
/* Used by worker snapshots only */
|
|
int nBlock; /* Number of blocks in database file */
|
|
LsmPgno aiAppend[LSM_APPLIST_SZ]; /* Append point list */
|
|
Freelist freelist; /* Free block list */
|
|
u32 nWrite; /* Total number of pages written to disk */
|
|
};
|
|
#define LSM_INITIAL_SNAPSHOT_ID 11
|
|
|
|
/*
|
|
** Functions from file "lsm_ckpt.c".
|
|
*/
|
|
int lsmCheckpointWrite(lsm_db *, u32 *);
|
|
int lsmCheckpointLevels(lsm_db *, int, void **, int *);
|
|
int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal);
|
|
|
|
int lsmCheckpointRecover(lsm_db *);
|
|
int lsmCheckpointDeserialize(lsm_db *, int, u32 *, Snapshot **);
|
|
|
|
int lsmCheckpointLoadWorker(lsm_db *pDb);
|
|
int lsmCheckpointStore(lsm_db *pDb, int);
|
|
|
|
int lsmCheckpointLoad(lsm_db *pDb, int *);
|
|
int lsmCheckpointLoadOk(lsm_db *pDb, int);
|
|
int lsmCheckpointClientCacheOk(lsm_db *);
|
|
|
|
u32 lsmCheckpointNBlock(u32 *);
|
|
i64 lsmCheckpointId(u32 *, int);
|
|
u32 lsmCheckpointNWrite(u32 *, int);
|
|
i64 lsmCheckpointLogOffset(u32 *);
|
|
int lsmCheckpointPgsz(u32 *);
|
|
int lsmCheckpointBlksz(u32 *);
|
|
void lsmCheckpointLogoffset(u32 *aCkpt, DbLog *pLog);
|
|
void lsmCheckpointZeroLogoffset(lsm_db *);
|
|
|
|
int lsmCheckpointSaveWorker(lsm_db *pDb, int);
|
|
int lsmDatabaseFull(lsm_db *pDb);
|
|
int lsmCheckpointSynced(lsm_db *pDb, i64 *piId, i64 *piLog, u32 *pnWrite);
|
|
|
|
int lsmCheckpointSize(lsm_db *db, int *pnByte);
|
|
|
|
int lsmInfoCompressionId(lsm_db *db, u32 *piCmpId);
|
|
|
|
/*
|
|
** Functions from file "lsm_tree.c".
|
|
*/
|
|
int lsmTreeNew(lsm_env *, int (*)(void *, int, void *, int), Tree **ppTree);
|
|
void lsmTreeRelease(lsm_env *, Tree *);
|
|
int lsmTreeInit(lsm_db *);
|
|
int lsmTreeRepair(lsm_db *);
|
|
|
|
void lsmTreeMakeOld(lsm_db *pDb);
|
|
void lsmTreeDiscardOld(lsm_db *pDb);
|
|
int lsmTreeHasOld(lsm_db *pDb);
|
|
|
|
int lsmTreeSize(lsm_db *);
|
|
int lsmTreeEndTransaction(lsm_db *pDb, int bCommit);
|
|
int lsmTreeLoadHeader(lsm_db *pDb, int *);
|
|
int lsmTreeLoadHeaderOk(lsm_db *, int);
|
|
|
|
int lsmTreeInsert(lsm_db *pDb, void *pKey, int nKey, void *pVal, int nVal);
|
|
int lsmTreeDelete(lsm_db *db, void *pKey1, int nKey1, void *pKey2, int nKey2);
|
|
void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark);
|
|
void lsmTreeMark(lsm_db *pDb, TreeMark *pMark);
|
|
|
|
int lsmTreeCursorNew(lsm_db *pDb, int, TreeCursor **);
|
|
void lsmTreeCursorDestroy(TreeCursor *);
|
|
|
|
int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes);
|
|
int lsmTreeCursorNext(TreeCursor *pCsr);
|
|
int lsmTreeCursorPrev(TreeCursor *pCsr);
|
|
int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast);
|
|
void lsmTreeCursorReset(TreeCursor *pCsr);
|
|
int lsmTreeCursorKey(TreeCursor *pCsr, int *pFlags, void **ppKey, int *pnKey);
|
|
int lsmTreeCursorFlags(TreeCursor *pCsr);
|
|
int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal);
|
|
int lsmTreeCursorValid(TreeCursor *pCsr);
|
|
int lsmTreeCursorSave(TreeCursor *pCsr);
|
|
|
|
void lsmFlagsToString(int flags, char *zFlags);
|
|
|
|
/*
|
|
** Functions from file "mem.c".
|
|
*/
|
|
void *lsmMalloc(lsm_env*, size_t);
|
|
void lsmFree(lsm_env*, void *);
|
|
void *lsmRealloc(lsm_env*, void *, size_t);
|
|
void *lsmReallocOrFree(lsm_env*, void *, size_t);
|
|
void *lsmReallocOrFreeRc(lsm_env *, void *, size_t, int *);
|
|
|
|
void *lsmMallocZeroRc(lsm_env*, size_t, int *);
|
|
void *lsmMallocRc(lsm_env*, size_t, int *);
|
|
|
|
void *lsmMallocZero(lsm_env *pEnv, size_t);
|
|
char *lsmMallocStrdup(lsm_env *pEnv, const char *);
|
|
|
|
/*
|
|
** Functions from file "lsm_mutex.c".
|
|
*/
|
|
int lsmMutexStatic(lsm_env*, int, lsm_mutex **);
|
|
int lsmMutexNew(lsm_env*, lsm_mutex **);
|
|
void lsmMutexDel(lsm_env*, lsm_mutex *);
|
|
void lsmMutexEnter(lsm_env*, lsm_mutex *);
|
|
int lsmMutexTry(lsm_env*, lsm_mutex *);
|
|
void lsmMutexLeave(lsm_env*, lsm_mutex *);
|
|
|
|
#ifndef NDEBUG
|
|
int lsmMutexHeld(lsm_env *, lsm_mutex *);
|
|
int lsmMutexNotHeld(lsm_env *, lsm_mutex *);
|
|
#endif
|
|
|
|
/**************************************************************************
|
|
** Start of functions from "lsm_file.c".
|
|
*/
|
|
int lsmFsOpen(lsm_db *, const char *, int);
|
|
int lsmFsOpenLog(lsm_db *, int *);
|
|
void lsmFsCloseLog(lsm_db *);
|
|
void lsmFsClose(FileSystem *);
|
|
|
|
int lsmFsUnmap(FileSystem *);
|
|
|
|
int lsmFsConfigure(lsm_db *db);
|
|
|
|
int lsmFsBlockSize(FileSystem *);
|
|
void lsmFsSetBlockSize(FileSystem *, int);
|
|
int lsmFsMoveBlock(FileSystem *pFS, Segment *pSeg, int iTo, int iFrom);
|
|
|
|
int lsmFsPageSize(FileSystem *);
|
|
void lsmFsSetPageSize(FileSystem *, int);
|
|
|
|
int lsmFsFileid(lsm_db *pDb, void **ppId, int *pnId);
|
|
|
|
/* Creating, populating, gobbling and deleting sorted runs. */
|
|
void lsmFsGobble(lsm_db *, Segment *, LsmPgno *, int);
|
|
int lsmFsSortedDelete(FileSystem *, Snapshot *, int, Segment *);
|
|
int lsmFsSortedFinish(FileSystem *, Segment *);
|
|
int lsmFsSortedAppend(FileSystem *, Snapshot *, Level *, int, Page **);
|
|
int lsmFsSortedPadding(FileSystem *, Snapshot *, Segment *);
|
|
|
|
/* Functions to retrieve the lsm_env pointer from a FileSystem or Page object */
|
|
lsm_env *lsmFsEnv(FileSystem *);
|
|
lsm_env *lsmPageEnv(Page *);
|
|
FileSystem *lsmPageFS(Page *);
|
|
|
|
int lsmFsSectorSize(FileSystem *);
|
|
|
|
void lsmSortedSplitkey(lsm_db *, Level *, int *);
|
|
|
|
/* Reading sorted run content. */
|
|
int lsmFsDbPageLast(FileSystem *pFS, Segment *pSeg, Page **ppPg);
|
|
int lsmFsDbPageGet(FileSystem *, Segment *, LsmPgno, Page **);
|
|
int lsmFsDbPageNext(Segment *, Page *, int eDir, Page **);
|
|
|
|
u8 *lsmFsPageData(Page *, int *);
|
|
int lsmFsPageRelease(Page *);
|
|
int lsmFsPagePersist(Page *);
|
|
void lsmFsPageRef(Page *);
|
|
LsmPgno lsmFsPageNumber(Page *);
|
|
|
|
int lsmFsNRead(FileSystem *);
|
|
int lsmFsNWrite(FileSystem *);
|
|
|
|
int lsmFsMetaPageGet(FileSystem *, int, int, MetaPage **);
|
|
int lsmFsMetaPageRelease(MetaPage *);
|
|
u8 *lsmFsMetaPageData(MetaPage *, int *);
|
|
|
|
#ifdef LSM_DEBUG
|
|
int lsmFsDbPageIsLast(Segment *pSeg, Page *pPg);
|
|
int lsmFsIntegrityCheck(lsm_db *);
|
|
#endif
|
|
|
|
LsmPgno lsmFsRedirectPage(FileSystem *, Redirect *, LsmPgno);
|
|
|
|
int lsmFsPageWritable(Page *);
|
|
|
|
/* Functions to read, write and sync the log file. */
|
|
int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr);
|
|
int lsmFsSyncLog(FileSystem *pFS);
|
|
int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr);
|
|
int lsmFsTruncateLog(FileSystem *pFS, i64 nByte);
|
|
int lsmFsTruncateDb(FileSystem *pFS, i64 nByte);
|
|
int lsmFsCloseAndDeleteLog(FileSystem *pFS);
|
|
|
|
LsmFile *lsmFsDeferClose(FileSystem *pFS);
|
|
|
|
/* And to sync the db file */
|
|
int lsmFsSyncDb(FileSystem *, int);
|
|
|
|
void lsmFsFlushWaiting(FileSystem *, int *);
|
|
|
|
/* Used by lsm_info(ARRAY_STRUCTURE) and lsm_config(MMAP) */
|
|
int lsmInfoArrayStructure(lsm_db *pDb, int bBlock, LsmPgno iFirst, char **pz);
|
|
int lsmInfoArrayPages(lsm_db *pDb, LsmPgno iFirst, char **pzOut);
|
|
int lsmConfigMmap(lsm_db *pDb, int *piParam);
|
|
|
|
int lsmEnvOpen(lsm_env *, const char *, int, lsm_file **);
|
|
int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile);
|
|
int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock);
|
|
int lsmEnvTestLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int nLock, int);
|
|
|
|
int lsmEnvShmMap(lsm_env *, lsm_file *, int, int, void **);
|
|
void lsmEnvShmBarrier(lsm_env *);
|
|
void lsmEnvShmUnmap(lsm_env *, lsm_file *, int);
|
|
|
|
void lsmEnvSleep(lsm_env *, int);
|
|
|
|
int lsmFsReadSyncedId(lsm_db *db, int, i64 *piVal);
|
|
|
|
int lsmFsSegmentContainsPg(FileSystem *pFS, Segment *, LsmPgno, int *);
|
|
|
|
void lsmFsPurgeCache(FileSystem *);
|
|
|
|
/*
|
|
** End of functions from "lsm_file.c".
|
|
**************************************************************************/
|
|
|
|
/*
|
|
** Functions from file "lsm_sorted.c".
|
|
*/
|
|
int lsmInfoPageDump(lsm_db *, LsmPgno, int, char **);
|
|
void lsmSortedCleanup(lsm_db *);
|
|
int lsmSortedAutoWork(lsm_db *, int nUnit);
|
|
|
|
int lsmSortedWalkFreelist(lsm_db *, int, int (*)(void *, int, i64), void *);
|
|
|
|
int lsmSaveWorker(lsm_db *, int);
|
|
|
|
int lsmFlushTreeToDisk(lsm_db *pDb);
|
|
|
|
void lsmSortedRemap(lsm_db *pDb);
|
|
|
|
void lsmSortedFreeLevel(lsm_env *pEnv, Level *);
|
|
|
|
int lsmSortedAdvanceAll(lsm_db *pDb);
|
|
|
|
int lsmSortedLoadMerge(lsm_db *, Level *, u32 *, int *);
|
|
int lsmSortedLoadFreelist(lsm_db *pDb, void **, int *);
|
|
|
|
void *lsmSortedSplitKey(Level *pLevel, int *pnByte);
|
|
|
|
void lsmSortedSaveTreeCursors(lsm_db *);
|
|
|
|
int lsmMCursorNew(lsm_db *, MultiCursor **);
|
|
void lsmMCursorClose(MultiCursor *, int);
|
|
int lsmMCursorSeek(MultiCursor *, int, void *, int , int);
|
|
int lsmMCursorFirst(MultiCursor *);
|
|
int lsmMCursorPrev(MultiCursor *);
|
|
int lsmMCursorLast(MultiCursor *);
|
|
int lsmMCursorValid(MultiCursor *);
|
|
int lsmMCursorNext(MultiCursor *);
|
|
int lsmMCursorKey(MultiCursor *, void **, int *);
|
|
int lsmMCursorValue(MultiCursor *, void **, int *);
|
|
int lsmMCursorType(MultiCursor *, int *);
|
|
lsm_db *lsmMCursorDb(MultiCursor *);
|
|
void lsmMCursorFreeCache(lsm_db *);
|
|
|
|
int lsmSaveCursors(lsm_db *pDb);
|
|
int lsmRestoreCursors(lsm_db *pDb);
|
|
|
|
void lsmSortedDumpStructure(lsm_db *pDb, Snapshot *, int, int, const char *);
|
|
void lsmFsDumpBlocklists(lsm_db *);
|
|
|
|
void lsmSortedExpandBtreePage(Page *pPg, int nOrig);
|
|
|
|
void lsmPutU32(u8 *, u32);
|
|
u32 lsmGetU32(u8 *);
|
|
u64 lsmGetU64(u8 *);
|
|
|
|
/*
|
|
** Functions from "lsm_varint.c".
|
|
*/
|
|
int lsmVarintPut32(u8 *, int);
|
|
int lsmVarintGet32(u8 *, int *);
|
|
int lsmVarintPut64(u8 *aData, i64 iVal);
|
|
int lsmVarintGet64(const u8 *aData, i64 *piVal);
|
|
|
|
int lsmVarintLen64(i64);
|
|
|
|
int lsmVarintLen32(int);
|
|
int lsmVarintSize(u8 c);
|
|
|
|
/*
|
|
** Functions from file "main.c".
|
|
*/
|
|
void lsmLogMessage(lsm_db *, int, const char *, ...);
|
|
int lsmInfoFreelist(lsm_db *pDb, char **pzOut);
|
|
|
|
/*
|
|
** Functions from file "lsm_log.c".
|
|
*/
|
|
int lsmLogBegin(lsm_db *pDb);
|
|
int lsmLogWrite(lsm_db *, int, void *, int, void *, int);
|
|
int lsmLogCommit(lsm_db *);
|
|
void lsmLogEnd(lsm_db *pDb, int bCommit);
|
|
void lsmLogTell(lsm_db *, LogMark *);
|
|
void lsmLogSeek(lsm_db *, LogMark *);
|
|
void lsmLogClose(lsm_db *);
|
|
|
|
int lsmLogRecover(lsm_db *);
|
|
int lsmInfoLogStructure(lsm_db *pDb, char **pzVal);
|
|
|
|
/* Valid values for the second argument to lsmLogWrite(). */
|
|
#define LSM_WRITE 0x06
|
|
#define LSM_DELETE 0x08
|
|
#define LSM_DRANGE 0x0A
|
|
|
|
/**************************************************************************
|
|
** Functions from file "lsm_shared.c".
|
|
*/
|
|
|
|
int lsmDbDatabaseConnect(lsm_db*, const char *);
|
|
void lsmDbDatabaseRelease(lsm_db *);
|
|
|
|
int lsmBeginReadTrans(lsm_db *);
|
|
int lsmBeginWriteTrans(lsm_db *);
|
|
int lsmBeginFlush(lsm_db *);
|
|
|
|
int lsmDetectRoTrans(lsm_db *db, int *);
|
|
int lsmBeginRoTrans(lsm_db *db);
|
|
|
|
int lsmBeginWork(lsm_db *);
|
|
void lsmFinishWork(lsm_db *, int, int *);
|
|
|
|
int lsmFinishRecovery(lsm_db *);
|
|
void lsmFinishReadTrans(lsm_db *);
|
|
int lsmFinishWriteTrans(lsm_db *, int);
|
|
int lsmFinishFlush(lsm_db *, int);
|
|
|
|
int lsmSnapshotSetFreelist(lsm_db *, int *, int);
|
|
|
|
Snapshot *lsmDbSnapshotClient(lsm_db *);
|
|
Snapshot *lsmDbSnapshotWorker(lsm_db *);
|
|
|
|
void lsmSnapshotSetCkptid(Snapshot *, i64);
|
|
|
|
Level *lsmDbSnapshotLevel(Snapshot *);
|
|
void lsmDbSnapshotSetLevel(Snapshot *, Level *);
|
|
|
|
void lsmDbRecoveryComplete(lsm_db *, int);
|
|
|
|
int lsmBlockAllocate(lsm_db *, int, int *);
|
|
int lsmBlockFree(lsm_db *, int);
|
|
int lsmBlockRefree(lsm_db *, int);
|
|
|
|
void lsmFreelistDeltaBegin(lsm_db *);
|
|
void lsmFreelistDeltaEnd(lsm_db *);
|
|
int lsmFreelistDelta(lsm_db *pDb);
|
|
|
|
DbLog *lsmDatabaseLog(lsm_db *pDb);
|
|
|
|
#ifdef LSM_DEBUG
|
|
int lsmHoldingClientMutex(lsm_db *pDb);
|
|
int lsmShmAssertLock(lsm_db *db, int iLock, int eOp);
|
|
int lsmShmAssertWorker(lsm_db *db);
|
|
#endif
|
|
|
|
void lsmFreeSnapshot(lsm_env *, Snapshot *);
|
|
|
|
|
|
/* Candidate values for the 3rd argument to lsmShmLock() */
|
|
#define LSM_LOCK_UNLOCK 0
|
|
#define LSM_LOCK_SHARED 1
|
|
#define LSM_LOCK_EXCL 2
|
|
|
|
int lsmShmCacheChunks(lsm_db *db, int nChunk);
|
|
int lsmShmLock(lsm_db *db, int iLock, int eOp, int bBlock);
|
|
int lsmShmTestLock(lsm_db *db, int iLock, int nLock, int eOp);
|
|
void lsmShmBarrier(lsm_db *db);
|
|
|
|
#ifdef LSM_DEBUG
|
|
void lsmShmHasLock(lsm_db *db, int iLock, int eOp);
|
|
#else
|
|
# define lsmShmHasLock(x,y,z)
|
|
#endif
|
|
|
|
int lsmReadlock(lsm_db *, i64 iLsm, u32 iShmMin, u32 iShmMax);
|
|
|
|
int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse);
|
|
int lsmTreeInUse(lsm_db *db, u32 iLsmId, int *pbInUse);
|
|
int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId);
|
|
|
|
int lsmDbMultiProc(lsm_db *);
|
|
void lsmDbDeferredClose(lsm_db *, lsm_file *, LsmFile *);
|
|
LsmFile *lsmDbRecycleFd(lsm_db *);
|
|
|
|
int lsmWalkFreelist(lsm_db *, int, int (*)(void *, int, i64), void *);
|
|
|
|
int lsmCheckCompressionId(lsm_db *, u32);
|
|
|
|
|
|
/**************************************************************************
|
|
** functions in lsm_str.c
|
|
*/
|
|
void lsmStringInit(LsmString*, lsm_env *pEnv);
|
|
int lsmStringExtend(LsmString*, int);
|
|
int lsmStringAppend(LsmString*, const char *, int);
|
|
void lsmStringVAppendf(LsmString*, const char *zFormat, va_list, va_list);
|
|
void lsmStringAppendf(LsmString*, const char *zFormat, ...);
|
|
void lsmStringClear(LsmString*);
|
|
char *lsmMallocPrintf(lsm_env*, const char*, ...);
|
|
int lsmStringBinAppend(LsmString *pStr, const u8 *a, int n);
|
|
|
|
int lsmStrlen(const char *zName);
|
|
|
|
|
|
|
|
/*
|
|
** Round up a number to the next larger multiple of 8. This is used
|
|
** to force 8-byte alignment on 64-bit architectures.
|
|
*/
|
|
#define ROUND8(x) (((x)+7)&~7)
|
|
|
|
#define LSM_MIN(x,y) ((x)>(y) ? (y) : (x))
|
|
#define LSM_MAX(x,y) ((x)>(y) ? (x) : (y))
|
|
|
|
#endif
|