sqlite/src/log.c


/*
** This file contains the implementation of a log file used in
** "journal_mode=wal" mode.
*/

/*
** LOG FILE FORMAT
**
** A log file consists of a header followed by zero or more log frames.
** The log header is 12 bytes in size and consists of the following three
** big-endian 32-bit unsigned integer values:
**
**     0: Database page size,
**     4: Randomly selected salt value 1,
**     8: Randomly selected salt value 2.
**
** Immediately following the log header are zero or more log frames. Each
** frame itself consists of a 16-byte header followed by a <page-size> bytes
** of page data. The header is broken into 4 big-endian 32-bit unsigned
** integer values, as follows:
**
**     0: Page number.
**     4: For commit records, the size of the database image in pages
**        after the commit. For all other records, zero.
**     8: Checksum value 1.
**    12: Checksum value 2.
*/

/*
** LOG SUMMARY FORMAT
**
** TODO.
*/

#include "log.h"

#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>

typedef struct LogSummaryHdr LogSummaryHdr;
typedef struct LogSummary LogSummary;
typedef struct LogIterator LogIterator;
typedef struct LogLock LogLock;


/*
** The following structure may be used to store the same data that
** is stored in the log-summary header.
**
** Member variables iCheck1 and iCheck2 contain the checksum for the
** last frame written to the log, or 2 and 3 respectively if the log
** is currently empty.
*/
struct LogSummaryHdr {
  /* Every member must be a u32: LOGSUMMARY_HDR_NFIELD derives the field
  ** count from sizeof(LogSummaryHdr)/sizeof(u32), and logSummaryWriteHdr()
  ** copies the structure into the log-summary with a single memcpy(). */
  u32 iChange;                    /* Counter incremented each transaction */
  u32 pgsz;                       /* Database page size in bytes */
  u32 iLastPg;                    /* Frame number of last valid frame in log */
  u32 nPage;                      /* Size of database in pages */
  u32 iCheck1;                    /* Checksum value 1 */
  u32 iCheck2;                    /* Checksum value 2 */
};

/* Number of u32 fields in a serialized LogSummaryHdr object. */
#define LOGSUMMARY_HDR_NFIELD (sizeof(LogSummaryHdr) / sizeof(u32))

/* Index (in u32 units) within LogSummary.aData[] of the first entry that
** follows the header and its LOG_CKSM_BYTES byte checksum. */
#define LOGSUMMARY_FRAME_OFFSET \
  (LOGSUMMARY_HDR_NFIELD + LOG_CKSM_BYTES/sizeof(u32))


/* Size in bytes of a log frame header and of the log file header */
#define LOG_FRAME_HDRSIZE 16
#define LOG_HDRSIZE       12

/*
** Return the offset of frame iFrame in the log file, assuming a database
** page size of pgsz bytes. The offset returned is to the start of the
** log frame-header. Frames are numbered starting from 1.
**
** The frame number is widened to i64 before the multiplication so that the
** computed offset does not wrap around when the log file grows beyond what
** a 32-bit integer can address. The value of the expression is an i64.
*/
#define logFrameOffset(iFrame, pgsz) (                               \
  LOG_HDRSIZE + ((i64)(iFrame)-1)*((pgsz)+LOG_FRAME_HDRSIZE)         \
)

/*
** There is one instance of this structure for each log-summary object
** that this process has a connection to. They are stored in a linked
** list starting at pLogSummary (global variable).
**
** TODO: LogSummary.fd is a unix file descriptor. Unix APIs are used
**       directly in this implementation because the VFS does not support
**       the required blocking file-locks.
*/
struct LogSummary {
  sqlite3_mutex *mutex;           /* Mutex used to protect this object */
  int nRef;                       /* Number of pointers to this structure */
  int fd;                         /* File descriptor open on log-summary. Set
                                  ** to -1 by logSummaryUnmap() once closed */
  char *zPath;                    /* Path to associated WAL file. The summary
                                  ** file is named "<zPath>-summary" */
  LogLock *pLock;                 /* Linked list of locks on this object */
  LogSummary *pNext;              /* Next in global list */
  int nData;                      /* Size of aData allocation/mapping (bytes) */
  u32 *aData;                     /* File body. Mapped by logSummaryMap(),
                                  ** zero while no mapping exists */
};

/*
** This module uses three different types of file-locks. All are taken
** on the log-summary file. The three types of locks are as follows:
**
** MUTEX:  The MUTEX lock is used as a robust inter-process mutex. It
**         is held while the log-summary header is modified, and
**         sometimes when it is read. It is also held while a new client
**         obtains the DMH lock (see below), and while log recovery is
**         being run.
**
** DMH:    The DMH (Dead Mans Hand mechanism) lock is used to ensure
**         that log-recovery is always run following a system restart.
**         When it first opens a log-summary file, a process takes a
**         SHARED lock on the DMH region. This lock is not released until
**         the log-summary file is closed.
**
**         The process then attempts to upgrade to an EXCLUSIVE lock. If
**         successful, then the contents of the log-summary file are deemed
**         suspect and the log-summary header zeroed. This forces the
**         first process that reads the log-summary file to run log
**         recovery. After zeroing the log-summary header, the process
**         downgrades to a SHARED lock on the DMH region.
**
**         If the attempt to obtain the EXCLUSIVE lock fails, then the
**         process concludes that some other process is already using the
**         log-summary file, and it can therefore be trusted.
**
**         The procedure described in the previous three paragraphs (taking
**         a SHARED lock and then upgrading to an EXCLUSIVE lock to check
**         if the process is the only one to have an open connection to the
**         log file) is protected by holding the MUTEX lock. This avoids the
**         race condition wherein the first two clients connect almost
**         simultaneously following a system restart and each prevents
**         the other from obtaining the EXCLUSIVE lock.
**
**
** REGION: There are 4 different region locks, regions A, B, C and D.
**         Various EXCLUSIVE and SHARED locks on these regions are obtained
**         when a client reads, writes or checkpoints the database.
**
**    To obtain a reader lock:
**
**         1. Attempt a SHARED lock on regions A and B.
**         2. If step 1 is successful, drop the lock on region B. Or, if
**            it is unsuccessful, attempt a SHARED lock on region D.
**         3. Repeat the above until the lock attempt in step 1 or 2 is
**            successful.
**
**         The reader lock is released when the read transaction is finished.
**
**    To obtain a writer lock:
**
**         1. Take (wait for) an EXCLUSIVE lock on regions C and D.
**
**         The locks are released after the write transaction is finished
**         and, if any frames were committed to the log, the log-summary
**         file updated.
**
**    To obtain a checkpointer lock:
**
**         1. Take (wait for) an EXCLUSIVE lock on regions B and C.
**         2. Take (wait for) an EXCLUSIVE lock on region A.
**
**         Step 1 waits until any existing writer has finished. And forces
**         all new readers to become "region D" readers.
**
**         Step 2 causes the checkpointer to wait until all existing region A
**         readers have finished their transactions. Once the exclusive lock
**         on region A has been obtained, only "region D" readers exist.
**         These readers are operating on the snapshot at the head of the
**         log. As such, the log can be safely copied into the database file
**         without interfering with the readers.
**
**         Once the checkpoint has finished and the log-summary header
**         updated (to indicate the log contents can now be ignored), all
**         locks are released.
**
**         However, there may still exist region D readers using data in
**         the body of the log file, so the log file itself cannot be
**         truncated or overwritten until all region D readers have finished.
**         That requirement is satisfied, because writers (the clients that
**         write to the log file) require an exclusive lock on region D.
**         Which they cannot get until all region D readers have finished.
*/
/* Byte offsets within the log-summary file at which the locks described
** above are taken. The MUTEX and DMH locks each cover a single byte at the
** given offset. The four REGION locks use the four bytes that follow
** offset LOG_LOCK_REGION (see the aMap[] table in logLockRegion()). */
#define LOG_LOCK_MUTEX  12
#define LOG_LOCK_DMH    13
#define LOG_LOCK_REGION 14

/*
** The four lockable regions associated with each log-summary. A connection
** may take either a SHARED or EXCLUSIVE lock on each. An ORed combination
** of the following bitmasks is passed as the second argument to the
** logLockRegion() function.
*/
#define LOG_REGION_A 0x01         /* Locked at file offset LOG_LOCK_REGION+4 */
#define LOG_REGION_B 0x02         /* Locked at file offset LOG_LOCK_REGION+3 */
#define LOG_REGION_C 0x04         /* Locked at file offset LOG_LOCK_REGION+2 */
#define LOG_REGION_D 0x08         /* Locked at file offset LOG_LOCK_REGION+1 */

/*
** A single instance of this structure is allocated as part of each
** connection to a database log. All structures associated with the
** same log file are linked together into a list using LogLock.pNext
** starting at LogSummary.pLock.
**
** The mLock field of the structure describes the locks (if any)
** currently held by the connection. If a SHARED lock is held on
** any of the four locking regions, then the associated LOG_REGION_X
** bit (see above) is set. If an EXCLUSIVE lock is held on the region,
** then the (LOG_REGION_X << 8) bit is set.
*/
struct LogLock {
  LogLock *pNext;                 /* Next lock on the same log */
  u32 mLock;                      /* Mask of locks. Low byte: regions held
                                  ** SHARED or EXCLUSIVE. Next byte: regions
                                  ** held EXCLUSIVE (LOG_REGION_X << 8) */
};

/*
** Handle for a single connection to a log file. sqlite3LogOpen() allocates
** this structure together with pVfs->szOsFile trailing bytes and points
** pFd at that trailing space.
*/
struct Log {
  LogSummary *pSummary;           /* Log file summary data */
  sqlite3_vfs *pVfs;              /* The VFS used to create pFd */
  sqlite3_file *pFd;              /* File handle for log file */
  int isLocked;                   /* Non-zero if a snapshot is held open */
  int isWriteLocked;              /* True if this is the writer connection */
  LogSummaryHdr hdr;              /* Log summary header for current snapshot */
  LogLock lock;                   /* Lock held by this connection (if any).
                                  ** Updated by logLockRegion() */
};


/*
** This structure is used to implement an iterator that iterates through
** all frames in the log in database page order. Where two or more frames
** correspond to the same database page, the iterator visits only the
** frame most recently written to the log.
**
** The internals of this structure are only accessed by:
**
**   logIteratorInit() - Create a new iterator,
**   logIteratorNext() - Step an iterator,
**   logIteratorFree() - Free an iterator.
**
** This functionality is used by the checkpoint code (see logCheckpoint()).
*/
struct LogIterator {
  int nSegment;                   /* Size of LogIterator.aSegment[] array */
  int nFinal;                     /* Elements in segment nSegment-1 */
  struct LogSegment {
    int iNext;                    /* Next aIndex index */
    u8 *aIndex;                   /* Pointer to index array */
    u32 *aDbPage;                 /* Pointer to db page array */
  } aSegment[1];                  /* NOTE(review): declared with one element;
                                  ** presumably over-allocated to nSegment
                                  ** entries by logIteratorInit() - confirm */
};


/*
** List of all LogSummary objects created by this process, linked together
** by LogSummary.pNext. Protected by static mutex LOG_SUMMARY_MUTEX, which
** sqlite3LogOpen() enters before searching the list.
**
** TODO: Should have a dedicated mutex here instead of borrowing the LRU
** mutex.
*/
#define LOG_SUMMARY_MUTEX SQLITE_MUTEX_STATIC_LRU
static LogSummary *pLogSummary = 0;

/*
** Generate an 8 byte checksum based on the data in array aByte[] and the
** initial values of aCksum[0] and aCksum[1]. The checksum is written into
** aCksum[] before returning.
**
** nByte must be a non-negative multiple of 4. The input is consumed as a
** sequence of u32 values read directly through a u32 pointer, so the values
** summed are in the byte order of the host and aByte is assumed to be
** suitably aligned for u32 access.
**
** If nByte is zero no input is read; only the final folding step is applied
** to the incoming aCksum[] values.
*/
#define LOG_CKSM_BYTES 8
static void logChecksumBytes(u8 *aByte, int nByte, u32 *aCksum){
  u64 sum1 = aCksum[0];
  u64 sum2 = aCksum[1];
  u32 *a32 = (u32 *)aByte;
  u32 *aEnd = (u32 *)&aByte[nByte];

  assert( LOG_CKSM_BYTES==2*sizeof(u32) );
  assert( nByte>=0 && (nByte&0x00000003)==0 );

  /* A pre-tested loop: a zero length buffer must not be dereferenced. */
  while( a32<aEnd ){
    sum1 += (*a32++);
    sum2 += sum1;
  }

  /* Fold the 64-bit accumulators down to 32 bits each. */
  aCksum[0] = (u32)(sum1 + (sum1>>24));
  aCksum[1] = (u32)(sum2 + (sum2>>24));
}

/*
** Argument zPath must be a nul-terminated string containing a path-name.
** This function modifies the string in-place by removing any "./" or "../"
** elements in the path. For example, the following input:
**
**   "/home/user/plans/good/../evil/./world_domination.txt"
**
** is overwritten with the 'normalized' version:
**
**   "/home/user/plans/evil/world_domination.txt"
**
** Runs of consecutive '/' characters are collapsed to a single '/' and
** trailing '/' characters are removed, except that a path consisting only
** of '/' characters is normalized to "/".
**
** A "." or ".." element is only recognized when it is followed by a '/'.
** A final "/." or "/.." at the very end of the path is left unchanged.
*/
static void logNormalizePath(char *zPath){
  int i, j;                       /* Read and write offsets within z[] */
  char *z = zPath;
  int n = (int)strlen(z);

  /* Ignore trailing '/' characters, but always keep at least one byte. */
  while( n>1 && z[n-1]=='/' ){ n--; }

  for(i=j=0; i<n; i++){
    if( z[i]=='/' ){
      /* Drop this '/' if another follows within the first n bytes. The
      ** bound matters: z[n] may be one of the trailing '/' characters
      ** stripped above, and must not cause the final '/' to be dropped. */
      if( i+1<n && z[i+1]=='/' ) continue;

      /* Skip over a "/." element that is followed by a '/'. */
      if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
        i += 1;
        continue;
      }

      /* For "/../", discard the previous element from the output. */
      if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
        while( j>0 && z[j-1]!='/' ){ j--; }
        if( j>0 ){ j--; }
        i += 2;
        continue;
      }
    }
    z[j++] = z[i];
  }
  z[j] = 0;
}

/*
** Map the first nByte bytes of the log-summary file (descriptor
** pSummary->fd) into memory and record the mapping in pSummary->aData and
** pSummary->nData. No mapping may exist when this function is called.
**
** If the file is currently smaller than nByte bytes, it is first extended
** to nByte bytes using ftruncate(). The caller is assumed to hold an
** exclusive lock on the summary file, which protects the ftruncate().
**
** Returns SQLITE_OK on success, or SQLITE_IOERR if fstat(), ftruncate() or
** mmap() fails.
*/
static int logSummaryMap(LogSummary *pSummary, int nByte){
  struct stat fileInfo;           /* Used to query the current file size */
  int fd = pSummary->fd;
  void *pRegion;                  /* Address returned by mmap() */

  assert( pSummary->aData==0 );

  /* Grow the file if it is too small to back the whole mapping. */
  if( fstat(fd, &fileInfo)!=0 ){
    return SQLITE_IOERR;
  }
  if( fileInfo.st_size<nByte && ftruncate(fd, nByte)!=0 ){
    return SQLITE_IOERR;
  }

  /* Create a shared read/write mapping of the start of the file. */
  pRegion = mmap(0, nByte, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
  if( pRegion==MAP_FAILED ) return SQLITE_IOERR;

  pSummary->nData = nByte;
  pSummary->aData = (u32 *)pRegion;
  return SQLITE_OK;
}

/*
** Unmap the log-summary mapping (if there is one) and close the
** file-descriptor. If the isUnlink argument is non-zero and a mapping
** existed, also unlink the "<zPath>-summary" file from the file-system.
**
** Regardless of the value of isUnlink, close the file-descriptor
** opened on the log-summary file and set pSummary->fd to -1.
**
** Returns SQLITE_OK, or SQLITE_NOMEM if the name of the file to unlink
** could not be allocated. In the latter case the file is not unlinked, but
** the mapping is still removed and the descriptor still closed.
**
** NOTE(review): the "fd>0" tests treat descriptor 0 as "not open". This
** matches the assert() below but would leak a summary file that happened
** to be opened on descriptor 0 - confirm how callers initialize fd.
*/
static int logSummaryUnmap(LogSummary *pSummary, int isUnlink){
  int rc = SQLITE_OK;
  if( pSummary->aData ){
    assert( pSummary->fd>0 );
    munmap(pSummary->aData, pSummary->nData);
    pSummary->aData = 0;
    if( isUnlink ){
      char *zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
      if( zFile ){
        unlink(zFile);
        sqlite3_free(zFile);
      }else{
        /* Never hand a NULL path to unlink(). Report the OOM instead. */
        rc = SQLITE_NOMEM;
      }
    }
  }
  if( pSummary->fd>0 ){
    close(pSummary->fd);
    pSummary->fd = -1;
  }
  return rc;
}

/*
** Copy the header *pHdr to the start of the mapped log-summary and store
** its checksum in the two u32 slots that immediately follow it. The
** checksum is computed over the header bytes, seeded with (1, 1).
*/
static void logSummaryWriteHdr(LogSummary *pSummary, LogSummaryHdr *pHdr){
  u32 *aOut = pSummary->aData;                /* Start of mapping */
  u32 *aSum = &aOut[LOGSUMMARY_HDR_NFIELD];   /* Checksum slots */

  memcpy(aOut, pHdr, sizeof(LogSummaryHdr));
  aSum[0] = 1;
  aSum[1] = 1;
  logChecksumBytes((u8 *)aOut, sizeof(LogSummaryHdr), aSum);
}

/*
** This function encodes a single frame header and writes it to a buffer
** supplied by the caller. A log frame-header is LOG_FRAME_HDRSIZE (16)
** bytes in size and is made up of four 4-byte big-endian integers:
**
**     0: Page number.
**     4: New database size (for commit frames, otherwise zero).
**     8: Frame checksum 1.
**    12: Frame checksum 2.
**
** The running checksum in aCksum[] is updated first with the initial 8
** bytes of the header and then with the nData bytes of page data at
** aData. The resulting values are stored in the header and left in
** aCksum[] for the caller.
*/
static void logEncodeFrame(
  u32 *aCksum,                    /* IN/OUT: Checksum values */
  u32 iPage,                      /* Database page number for frame */
  u32 nTruncate,                  /* New db size (or 0 for non-commit frames) */
  int nData,                      /* Database page size (size of aData[]) */
  u8 *aData,                      /* Pointer to page data (for checksum) */
  u8 *aFrame                      /* OUT: Write encoded frame here */
){
  u8 *aSumField = &aFrame[8];     /* Where the two checksum words are stored */

  assert( LOG_FRAME_HDRSIZE==16 );

  /* Page number and database size fields */
  sqlite3Put4byte(&aFrame[0], iPage);
  sqlite3Put4byte(&aFrame[4], nTruncate);

  /* Extend the running checksum over those two fields and the page data */
  logChecksumBytes(aFrame, 8, aCksum);
  logChecksumBytes(aData, nData, aCksum);

  /* Checksum fields */
  sqlite3Put4byte(&aSumField[0], aCksum[0]);
  sqlite3Put4byte(&aSumField[4], aCksum[1]);
}

/*
** Verify the checksum of the frame whose 16 byte header is at aFrame and
** whose nData bytes of page data are at aData.
**
** The running checksum aCksum[] is always advanced over the first 8 bytes
** of the header and then over the page data, whether or not the frame
** turns out to be valid. If the result matches the two checksum words
** stored in the header, *piPage and *pnTruncate are populated from the
** header and 1 is returned. Otherwise 0 is returned and the two output
** variables are left untouched.
*/
static int logDecodeFrame(
  u32 *aCksum,                    /* IN/OUT: Checksum values */
  u32 *piPage,                    /* OUT: Database page number for frame */
  u32 *pnTruncate,                /* OUT: New db size (or 0 if not commit) */
  int nData,                      /* Database page size (size of aData[]) */
  u8 *aData,                      /* Pointer to page data (for checksum) */
  u8 *aFrame                      /* Frame data */
){
  u32 iStored1;                   /* Checksum word 1 as stored in header */
  u32 iStored2;                   /* Checksum word 2 as stored in header */

  assert( LOG_FRAME_HDRSIZE==16 );

  logChecksumBytes(aFrame, 8, aCksum);
  logChecksumBytes(aData, nData, aCksum);

  iStored1 = sqlite3Get4byte(&aFrame[8]);
  iStored2 = sqlite3Get4byte(&aFrame[12]);
  if( aCksum[0]==iStored1 && aCksum[1]==iStored2 ){
    *piPage = sqlite3Get4byte(&aFrame[0]);
    *pnTruncate = sqlite3Get4byte(&aFrame[4]);
    return 1;
  }

  /* Checksum failed. */
  return 0;
}

/*
** Sort the *pnList entries of aList[] using a recursive merge-sort. Each
** entry of aList[] is an index into aContent[]; entries are ordered by
** ascending aContent[] value. Entries that refer to equal aContent[]
** values are reduced to a single entry, so the list may shrink: on return
** *pnList is set to the number of entries remaining and the values
** aContent[aList[i]] are strictly increasing.
**
** When the two halves being merged each contain an entry for the same
** aContent[] value, the entry from the right-hand half is kept and the
** one from the left-hand half is discarded.
**
** aBuffer is scratch space used to hold the output of each merge. It is
** shared by all levels of the recursion.
*/
static void logMergesort8(
  Pgno *aContent,                 /* Pages in log */
  u8 *aBuffer,                    /* Buffer of at least *pnList items to use */
  u8 *aList,                      /* IN/OUT: List to sort */
  int *pnList                     /* IN/OUT: Number of elements in aList[] */
){
  int nList = *pnList;
  if( nList>1 ){
    int nLeft = nList / 2;        /* Elements in left list */
    int nRight = nList - nLeft;   /* Elements in right list */
    u8 *aLeft = aList;            /* Left list */
    u8 *aRight = &aList[nLeft];   /* Right list */
    int iLeft = 0;                /* Current index in aLeft */
    int iRight = 0;               /* Current index in aright */
    int iOut = 0;                 /* Current index in output buffer */

    /* Sort each half in place. nLeft and nRight may be reduced by these
    ** calls if duplicates are removed. aRight stays where it is.
    **
    ** TODO: Change to non-recursive version. */
    logMergesort8(aContent, aBuffer, aLeft, &nLeft);
    logMergesort8(aContent, aBuffer, aRight, &nRight);

    /* Merge the two sorted, duplicate-free halves into aBuffer[]. */
    while( iRight<nRight || iLeft<nLeft ){
      u8 logpage;
      Pgno dbpage;

      /* Take from the left list only if its next value is strictly smaller
      ** than the next value in the right list (or the right is exhausted).
      ** On a tie the right-hand entry is therefore the one emitted. */
      if( (iLeft<nLeft)
       && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
      ){
        logpage = aLeft[iLeft++];
      }else{
        logpage = aRight[iRight++];
      }
      dbpage = aContent[logpage];

      aBuffer[iOut++] = logpage;

      /* If the left list has an entry for the same value as the one just
      ** output (which must then have come from the right), discard it. */
      if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;

      assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
      assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
    }
    memcpy(aList, aBuffer, sizeof(aList[0])*iOut);
    *pnList = iOut;
  }

  /* In debug builds, check that the output is strictly increasing. */
#ifdef SQLITE_DEBUG
  {
    int i;
    for(i=1; i<*pnList; i++){
      assert( aContent[aList[i]] > aContent[aList[i-1]] );
    }
  }
#endif
}


/*
** Return the index in the LogSummary.aData array that corresponds to
** frame iFrame. The log-summary file consists of a header, followed by
** alternating "map" and "index" blocks.
**
** The first map entry follows the LOGSUMMARY_HDR_NFIELD header fields and
** the 2 checksum fields. Each complete group of 256 map entries that
** precedes the frame's own entry adds a further 64 array slots.
*/
static int logSummaryEntry(u32 iFrame){
  u32 iZeroBased = iFrame - 1;            /* Frame number counted from 0 */
  u32 nSkip = (iZeroBased >> 8) << 6;     /* Slots used by earlier indexes */

  return (int)(nSkip + iZeroBased + 2 + LOGSUMMARY_HDR_NFIELD);
}


/*
** Set an entry in the log-summary map to map log frame iFrame to db
** page iPage. Values are always appended to the log-summary (i.e. the
** value of iFrame is always exactly one more than the value passed to
** the previous call), but that restriction is not enforced or asserted
** here.
**
** When iFrame is a multiple of 256, a 256 byte index of the 256 most
** recently added entries is also built, immediately after the entry for
** iFrame. The 256 bytes that follow the index are used as scratch space
** while sorting.
*/
static void logSummaryAppend(LogSummary *pSummary, u32 iFrame, u32 iPage){
  u32 *aData = pSummary->aData;
  u32 iSlot = logSummaryEntry(iFrame);

  /* Record the page number for this frame */
  aData[iSlot] = iPage;

  /* Frames are numbered starting at 1, so a frame number with its low
  ** eight bits clear completes a group of 256 map entries. Build the
  ** index for the group.
  */
  if( (iFrame&0x000000FF)==0 ){
    u32 *aPgno = &aData[iSlot-255];       /* The 256 map entries */
    u8 *aIndex = (u8 *)&aData[iSlot+1];   /* 256 bytes to build index in */
    u8 *aScratch = &aIndex[256];          /* Scratch space for the sort */
    int nIndex = 256;                     /* Number of entries in index */
    int k = 0;

    /* Start from the identity permutation, then sort and de-duplicate */
    while( k<256 ){
      aIndex[k] = (u8)k;
      k++;
    }
    logMergesort8(aPgno, aScratch, aIndex, &nIndex);

    /* Pad the unused tail of the index with copies of the last entry */
    memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
  }
}


/*
** Recover the log-summary by reading the log file. The caller must hold
** an exclusive lock on the log-summary file.
**
** The log header is read to obtain the page size and the initial checksum
** values. Frames are then read in order and added to the log-summary map
** until the end of the file, a read error or a frame with a bad checksum
** is encountered. Each time a commit frame (non-zero database size field)
** is seen, the recovered header is updated to describe the log up to and
** including that frame.
**
** The recovered header is written to the log-summary before returning,
** except when the file size or log header cannot be read or the frame
** buffer cannot be allocated. In those cases an error code is returned
** immediately.
**
** If the page size stored in the log header is zero, is not a power of
** two or is larger than SQLITE_MAX_PAGE_SIZE, the log is deemed to contain
** no valid data and an all-zero header is written.
*/
static int logSummaryRecover(LogSummary *pSummary, sqlite3_file *pFd){
  int rc;                         /* Return Code */
  i64 nSize;                      /* Size of log file */
  LogSummaryHdr hdr;              /* Recovered log-summary header */

  memset(&hdr, 0, sizeof(hdr));

  rc = sqlite3OsFileSize(pFd, &nSize);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  if( nSize>LOG_FRAME_HDRSIZE ){
    u8 aBuf[LOG_FRAME_HDRSIZE];   /* Buffer to load the log header into */
    u8 *aFrame = 0;               /* Malloc'd buffer to load entire frame */
    int nFrame;                   /* Number of bytes at aFrame */
    u8 *aData;                    /* Pointer to data part of aFrame buffer */
    int iFrame;                   /* Index of last frame read */
    i64 iOffset;                  /* Next offset to read from log file */
    u32 pgszField;                /* Page size field as stored in the log */
    int nPgsz;                    /* Page size according to the log */
    u32 aCksum[2];                /* Running checksum */

    /* Read in the log header (to determine the database page size and
    ** the initial checksum values).
    */
    rc = sqlite3OsRead(pFd, aBuf, LOG_HDRSIZE, 0);
    if( rc!=SQLITE_OK ){
      return rc;
    }

    /* If the database page size is zero, is not a power of two, or is
    ** greater than SQLITE_MAX_PAGE_SIZE, conclude that the log file contains
    ** no valid data. The test is made on the unsigned value read from the
    ** file. A zero page size would otherwise be accepted and lead to frames
    ** with no data being checksummed, and a value with the top bit set
    ** would become a negative int.
    */
    pgszField = sqlite3Get4byte(&aBuf[0]);
    if( pgszField==0
     || (pgszField&(pgszField-1))!=0
     || pgszField>SQLITE_MAX_PAGE_SIZE
    ){
      goto finished;
    }
    nPgsz = (int)pgszField;
    aCksum[0] = sqlite3Get4byte(&aBuf[4]);
    aCksum[1] = sqlite3Get4byte(&aBuf[8]);

    /* Malloc a buffer to read frames into. */
    nFrame = nPgsz + LOG_FRAME_HDRSIZE;
    aFrame = (u8 *)sqlite3_malloc(nFrame);
    if( !aFrame ){
      return SQLITE_NOMEM;
    }
    aData = &aFrame[LOG_FRAME_HDRSIZE];

    /* Read all frames from the log file. */
    iFrame = 0;
    for(iOffset=LOG_HDRSIZE; (iOffset+nFrame)<=nSize; iOffset+=nFrame){
      u32 pgno;                   /* Database page number for frame */
      u32 nTruncate;              /* dbsize field from frame header */
      int isValid;                /* True if this frame is valid */

      /* Read and decode the next log frame. */
      rc = sqlite3OsRead(pFd, aFrame, nFrame, iOffset);
      if( rc!=SQLITE_OK ) break;
      isValid = logDecodeFrame(aCksum, &pgno, &nTruncate, nPgsz, aData, aFrame);
      if( !isValid ) break;
      logSummaryAppend(pSummary, ++iFrame, pgno);

      /* If nTruncate is non-zero, this is a commit record. */
      if( nTruncate ){
        hdr.iCheck1 = aCksum[0];
        hdr.iCheck2 = aCksum[1];
        hdr.iLastPg = iFrame;
        hdr.nPage = nTruncate;
        hdr.pgsz = nPgsz;
      }
    }

    sqlite3_free(aFrame);
  }else{
    /* The log is empty. Use the checksum values that denote an empty log. */
    hdr.iCheck1 = 2;
    hdr.iCheck2 = 3;
  }

finished:
  logSummaryWriteHdr(pSummary, &hdr);
  return rc;
}

/*
** Values for the third parameter to logLockRegion(). The same values are
** passed as the "op" argument of logLockFd(), where they are used to index
** the aType[] and aOp[] tables. They must therefore remain 0, 1, 2 and 3.
*/
#define LOG_UNLOCK  0             /* Release lock (F_UNLCK) */
#define LOG_RDLOCK  1             /* SHARED lock, F_SETLK (does not block) */
#define LOG_WRLOCK  2             /* EXCLUSIVE lock, F_SETLK (does not block) */
#define LOG_WRLOCKW 3             /* EXCLUSIVE lock, F_SETLKW (blocks) */

/*
** Apply locking operation op (one of the LOG_UNLOCK, LOG_RDLOCK,
** LOG_WRLOCK or LOG_WRLOCKW values) to the nByte bytes starting at offset
** iStart of the log-summary file, using fcntl() on pSummary->fd.
**
** Returns SQLITE_OK if fcntl() reports success, or SQLITE_BUSY if it
** fails for any reason.
*/
static int logLockFd(LogSummary *pSummary, int iStart, int nByte, int op){
  /* The fcntl() lock type and command to use for each operation. Indexed
  ** by the LOG_XXX value. */
  static const struct LockOp {
    int eType;                    /* Value for flock.l_type */
    int eCmd;                     /* Second argument to fcntl() */
  } aLockOp[4] = {
    { F_UNLCK, F_SETLK  },        /* LOG_UNLOCK */
    { F_RDLCK, F_SETLK  },        /* LOG_RDLOCK */
    { F_WRLCK, F_SETLK  },        /* LOG_WRLOCK */
    { F_WRLCK, F_SETLKW }         /* LOG_WRLOCKW */
  };
  struct flock sLock;             /* Locking operation */

  assert( op>=0 && op<ArraySize(aLockOp) );

  memset(&sLock, 0, sizeof(sLock));
  sLock.l_whence = SEEK_SET;
  sLock.l_start = iStart;
  sLock.l_len = nByte;
  sLock.l_type = aLockOp[op].eType;

  if( fcntl(pSummary->fd, aLockOp[op].eCmd, &sLock)!=0 ){
    return SQLITE_BUSY;
  }
  return SQLITE_OK;
}

/*
** Obtain or release locks on the regions identified by mask mRegion (an
** ORed combination of LOG_REGION_X values) on behalf of connection pLog.
** Parameter op must be LOG_UNLOCK, LOG_RDLOCK or LOG_WRLOCK. Only the
** (op, mRegion) combinations listed in the first assert() are supported.
**
** The locks held by all connections in this process are tracked in the
** LogLock.mLock masks linked from LogSummary.pLock. The locks held on the
** shared file-descriptor are adjusted, using logLockFd(), only where
** required.
**
** Returns SQLITE_OK and updates pLog->lock.mLock if successful. Returns
** SQLITE_BUSY, leaving pLog->lock.mLock unchanged, if the request
** conflicts with a lock held by another connection in this process or if
** logLockFd() fails.
**
** NOTE(review): LOG_WRLOCK maps to a non-blocking F_SETLK in logLockFd(),
** although the overview comment earlier in this file describes the writer
** and checkpointer as waiting for their locks - confirm with callers.
*/
static int logLockRegion(Log *pLog, u32 mRegion, int op){
  LogSummary *pSummary = pLog->pSummary;
  LogLock *p;                     /* Used to iterate through in-process locks */
  u32 mOther;                     /* Locks held by other connections */
  u32 mNew;                       /* New mask for pLog */

  assert(
       /* Writer lock operations */
          (op==LOG_WRLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
       || (op==LOG_UNLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))

       /* Normal reader lock operations */
       || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B))
       || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A))
       || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B))

       /* Region D reader lock operations */
       || (op==LOG_RDLOCK && mRegion==(LOG_REGION_D))
       || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A))
       || (op==LOG_UNLOCK && mRegion==(LOG_REGION_D))

       /* Checkpointer lock operations */
       || (op==LOG_WRLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
       || (op==LOG_WRLOCK && mRegion==(LOG_REGION_A))
       || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
       || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B|LOG_REGION_C))
  );

  /* Assert that a connection never tries to go from an EXCLUSIVE to a
  ** SHARED lock on a region. Moving from SHARED to EXCLUSIVE sometimes
  ** happens though (when a region D reader upgrades to a writer).
  */
  assert( op!=LOG_RDLOCK || 0==(pLog->lock.mLock & (mRegion<<8)) );

  sqlite3_mutex_enter(pSummary->mutex);

  /* Calculate a mask of locks held by all connections in this process apart
  ** from this one. The least significant byte of the mask contains a mask
  ** of the SHARED locks held. The next least significant byte of the mask
  ** indicates the EXCLUSIVE locks held. For example, to test if some other
  ** connection is holding a SHARED lock on region A, or an EXCLUSIVE lock
  ** on region C, do:
  **
  **   hasSharedOnA    = (mOther & (LOG_REGION_A<<0));
  **   hasExclusiveOnC = (mOther & (LOG_REGION_C<<8));
  **
  ** In all masks, if the bit in the EXCLUSIVE byte mask is set, so is the
  ** corresponding bit in the SHARED mask.
  */
  mOther = 0;
  for(p=pSummary->pLock; p; p=p->pNext){
    assert( (p->mLock & (p->mLock<<8))==(p->mLock&0x0000FF00) );
    if( p!=&pLog->lock ){
      mOther |= p->mLock;
    }
  }

  /* If this call is to lock a region (not to unlock one), test if locks held
  ** by any other connection in this process prevent the new locks from
  ** being granted. If so, exit the summary mutex and return SQLITE_BUSY.
  **
  ** A SHARED request conflicts only with EXCLUSIVE locks (the second byte
  ** of mOther). An EXCLUSIVE request conflicts with any lock (the low byte,
  ** which is also set for regions held EXCLUSIVE).
  */
  if( op && (mOther & (mRegion << (op==LOG_RDLOCK ? 8 : 0))) ){
    sqlite3_mutex_leave(pSummary->mutex);
    return SQLITE_BUSY;
  }

  /* Figure out the new lock mask for this connection. */
  switch( op ){
    case LOG_UNLOCK:
      mNew = (pLog->lock.mLock & ~(mRegion|(mRegion<<8)));
      break;
    case LOG_RDLOCK:
      mNew = (pLog->lock.mLock | mRegion);
      break;
    default:
      assert( op==LOG_WRLOCK );
      mNew = (pLog->lock.mLock | (mRegion<<8) | mRegion);
      break;
  }

  /* Now modify the locks held on the log-summary file descriptor. This
  ** file descriptor is shared by all log connections in this process.
  ** Therefore:
  **
  **   + If one or more log connections in this process hold a SHARED lock
  **     on a region, the file-descriptor should hold a SHARED lock on
  **     the file region.
  **
  **   + If a log connection in this process holds an EXCLUSIVE lock on a
  **     region, the file-descriptor should also hold an EXCLUSIVE lock on
  **     the region in question.
  **
  ** If this is an LOG_UNLOCK operation, only regions for which no other
  ** connection holds a lock should actually be unlocked. And if this
  ** is a LOG_RDLOCK operation and other connections already hold all
  ** the required SHARED locks, then no system call is required.
  */
  if( op==LOG_UNLOCK ){
    mRegion = (mRegion & ~mOther);
  }
  if( (op==LOG_WRLOCK)
   || (op==LOG_UNLOCK && mRegion)
   || (op==LOG_RDLOCK && (mOther&mRegion)!=mRegion)
  ){
    /* Table mapping a region mask (used as the array index) to the byte
    ** range of the log-summary file that must be locked. Region D is at
    ** the lowest offset and region A at the highest. Entries of {0, 0}
    ** correspond to masks that this function never locks as a unit. The
    ** assert() below catches any attempt to use one.
    */
    struct LockMap {
      int iStart;                 /* Byte offset to start locking operation */
      int iLen;                   /* Length field for locking operation */
    } aMap[] = {
      /* 0000 */ {0, 0},                    /* 0001 */ {4+LOG_LOCK_REGION, 1},
      /* 0010 */ {3+LOG_LOCK_REGION, 1},    /* 0011 */ {3+LOG_LOCK_REGION, 2},
      /* 0100 */ {2+LOG_LOCK_REGION, 1},    /* 0101 */ {0, 0},
      /* 0110 */ {2+LOG_LOCK_REGION, 2},    /* 0111 */ {2+LOG_LOCK_REGION, 3},
      /* 1000 */ {1+LOG_LOCK_REGION, 1},    /* 1001 */ {0, 0},
      /* 1010 */ {0, 0},                    /* 1011 */ {0, 0},
      /* 1100 */ {1+LOG_LOCK_REGION, 2},    /* 1101 */ {0, 0},
      /* 1110 */ {0, 0},                    /* 1111 */ {0, 0}
    };
    int rc;                       /* Return code of logLockFd() */

    assert( mRegion<ArraySize(aMap) && aMap[mRegion].iStart!=0 );

    rc = logLockFd(pSummary, aMap[mRegion].iStart, aMap[mRegion].iLen, op);
    if( rc!=0 ){
      sqlite3_mutex_leave(pSummary->mutex);
      return rc;
    }
  }

  /* Only record the new mask once the file-descriptor locks are in place. */
  pLog->lock.mLock = mNew;
  sqlite3_mutex_leave(pSummary->mutex);
  return SQLITE_OK;
}

/*
** Take a SHARED (eLock==LOG_RDLOCK) or EXCLUSIVE (eLock==LOG_WRLOCK) lock
** on the single byte DMH region of the log-summary file. Neither variant
** blocks. Returns the value returned by logLockFd().
*/
static int logLockDMH(LogSummary *pSummary, int eLock){
  int rc;                         /* Result of the locking operation */

  assert( eLock==LOG_WRLOCK || eLock==LOG_RDLOCK );
  rc = logLockFd(pSummary, LOG_LOCK_DMH, 1, eLock);
  return rc;
}

/*
** Obtain (eLock==LOG_WRLOCKW, blocking) or release (eLock==LOG_UNLOCK) the
** MUTEX lock on the log-summary file.
**
** Returns SQLITE_OK if the locking operation succeeded, or the error code
** returned by logLockFd() (SQLITE_BUSY) if it did not. A failure while
** obtaining the lock means that the caller does NOT hold the inter-process
** mutex and must not go on to modify the log-summary.
*/
static int logLockMutex(LogSummary *pSummary, int eLock){
  assert( eLock==LOG_WRLOCKW || eLock==LOG_UNLOCK );
  return logLockFd(pSummary, LOG_LOCK_MUTEX, 1, eLock);
}


/*
** This function initializes the connection to the log-summary identified
** by struct pSummary. The caller must hold pSummary->mutex.
**
** The "<zPath>-summary" file is opened (and created if necessary) and
** mapped into memory while holding the MUTEX file-lock. If an EXCLUSIVE
** lock can be obtained on the dead-mans-hand (DMH) region, the log-summary
** header is zeroed. A SHARED lock on the DMH region is then taken and left
** in place when this function returns successfully.
**
** Returns SQLITE_OK on success. Otherwise returns SQLITE_NOMEM if the file
** name cannot be allocated, SQLITE_IOERR if the file cannot be opened or
** the SHARED DMH lock cannot be obtained, or the error code returned by
** logLockMutex() or logSummaryMap(). The MUTEX file-lock is always released
** before returning. On error the file descriptor, and possibly the mapping,
** are left in place.
*/
static int logSummaryInit(
  LogSummary *pSummary,           /* Log summary object to initialize */
  sqlite3_file *pFd               /* File descriptor open on log file */
){
  int rc;                         /* Return Code */
  char *zFile;                    /* File name for summary file */

  assert( pSummary->fd<0 );
  assert( pSummary->aData==0 );
  assert( pSummary->nRef>0 );
  assert( pSummary->zPath );

  /* Open a file descriptor on the summary file. */
  zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
  if( !zFile ){
    return SQLITE_NOMEM;
  }
  pSummary->fd = open(zFile, O_RDWR|O_CREAT, S_IWUSR|S_IRUSR);
  sqlite3_free(zFile);
  if( pSummary->fd<0 ){
    return SQLITE_IOERR;
  }

  /* Grab an exclusive lock on the summary file. Then mmap() it.
  **
  ** TODO: This code needs to be enhanced to support a growable mapping.
  ** For now, just make the mapping very large to start with. The
  ** pages should not be allocated until they are first accessed anyhow,
  ** so using a large mapping consumes no more resources than a smaller
  ** one would.
  */
  assert( sqlite3_mutex_held(pSummary->mutex) );
  rc = logLockMutex(pSummary, LOG_WRLOCKW);
  if( rc!=SQLITE_OK ) return rc;
  rc = logSummaryMap(pSummary, 512*1024);
  if( rc!=SQLITE_OK ) goto out;

  /* Try to obtain an EXCLUSIVE lock on the dead-mans-hand region. If this
  ** is possible, the contents of the log-summary file (if any) may not
  ** be trusted. Zero the log-summary header before continuing.
  */
  rc = logLockDMH(pSummary, LOG_WRLOCK);
  if( rc==SQLITE_OK ){
    memset(pSummary->aData, 0, (LOGSUMMARY_HDR_NFIELD+2)*sizeof(u32) );
  }

  /* Take (or downgrade to) the SHARED lock on the DMH region. If this
  ** fails, report an IO error but still fall through to release the MUTEX
  ** lock. Returning directly from here would leave it held.
  */
  rc = logLockDMH(pSummary, LOG_RDLOCK);
  if( rc!=SQLITE_OK ){
    rc = SQLITE_IOERR;
  }

 out:
  logLockMutex(pSummary, LOG_UNLOCK);
  return rc;
}

/*
** Open a connection to the log file associated with database zDb. The
** database file does not actually have to exist. zDb is used only to
** figure out the name of the log file to open. If the log file does not
** exist it is created by this call.
**
** A SHARED lock should be held on the database file when this function
** is called. The purpose of this SHARED lock is to prevent any other
** client from unlinking the log or log-summary file. If another process
** were to do this just after this client opened one of these files, the
** system would be badly broken.
**
** If successful, SQLITE_OK is returned and *ppLog is set to point to the
** new Log handle. Otherwise an SQLite error code is returned and *ppLog
** is set to NULL. In the error case the reference taken on the shared
** LogSummary object is dropped again, and the object is unlinked and
** freed only if no other connection in this process is using it.
*/
int sqlite3LogOpen(
  sqlite3_vfs *pVfs,              /* vfs module to open log file with */
  const char *zDb,                /* Name of database file */
  Log **ppLog                     /* OUT: Allocated Log handle */
){
  int rc = SQLITE_OK;             /* Return Code */
  Log *pRet;                      /* Object to allocate and return */
  LogSummary *pSummary = 0;       /* Summary object */
  sqlite3_mutex *mutex = 0;       /* Mutex currently held (or NULL) */
  int flags;                      /* Flags passed to OsOpen() */
  char *zWal = 0;                 /* Path to WAL file */
  int nWal;                       /* Length of zWal in bytes */

  assert( zDb );

  /* Allocate an instance of struct Log to return. The sqlite3_file
  ** object used for the log file lives in the same allocation,
  ** immediately after the Log structure.
  */
  *ppLog = 0;
  pRet = (Log *)sqlite3MallocZero(sizeof(Log) + pVfs->szOsFile);
  if( !pRet ){
    rc = SQLITE_NOMEM;
    goto out;
  }
  pRet->pVfs = pVfs;
  pRet->pFd = (sqlite3_file *)&pRet[1];

  /* Normalize the path name. */
  zWal = sqlite3_mprintf("%s-wal", zDb);
  if( !zWal ){
    rc = SQLITE_NOMEM;
    goto out;
  }
  logNormalizePath(zWal);
  flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_JOURNAL);
  nWal = sqlite3Strlen30(zWal);

  /* Enter the mutex that protects the linked-list of LogSummary structures */
  if( sqlite3GlobalConfig.bCoreMutex ){
    mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
  }
  sqlite3_mutex_enter(mutex);

  /* Search for an existing log summary object in the linked list. If one
  ** cannot be found, allocate and initialize a new object.
  */
  for(pSummary=pLogSummary; pSummary; pSummary=pSummary->pNext){
    int nPath = sqlite3Strlen30(pSummary->zPath);
    if( nWal==nPath && 0==memcmp(pSummary->zPath, zWal, nPath) ) break;
  }
  if( !pSummary ){
    int nByte = sizeof(LogSummary) + nWal + 1;
    pSummary = (LogSummary *)sqlite3MallocZero(nByte);
    if( !pSummary ){
      rc = SQLITE_NOMEM;
      goto out;
    }
    if( sqlite3GlobalConfig.bCoreMutex ){
      pSummary->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_RECURSIVE);
      if( !pSummary->mutex ){
        /* The object has not been linked into the list or referenced
        ** yet, so it can simply be discarded. */
        sqlite3_free(pSummary);
        pSummary = 0;
        rc = SQLITE_NOMEM;
        goto out;
      }
    }
    /* The path is stored in the same allocation, directly after the
    ** structure. The buffer was zeroed, so the copy is nul-terminated. */
    pSummary->zPath = (char *)&pSummary[1];
    pSummary->fd = -1;
    memcpy(pSummary->zPath, zWal, nWal);
    pSummary->pNext = pLogSummary;
    pLogSummary = pSummary;
  }
  pSummary->nRef++;
  pRet->pSummary = pSummary;

  /* Exit the mutex protecting the linked-list of LogSummary objects. */
  sqlite3_mutex_leave(mutex);
  mutex = 0;

  /* Open file handle on the log file. */
  rc = sqlite3OsOpen(pVfs, pSummary->zPath, pRet->pFd, flags, &flags);
  if( rc!=SQLITE_OK ) goto out;

  /* Object pSummary is shared between all connections to the database made
  ** by this process. So at this point it may or may not be connected to
  ** the log-summary. If it is not, connect it.
  */
  sqlite3_mutex_enter(pSummary->mutex);
  mutex = pSummary->mutex;
  if( pSummary->fd<0 ){
    rc = logSummaryInit(pSummary, pRet->pFd);
  }

  /* Only link the new connection into the summary's list of locks once
  ** it is certain that the handle will be returned to the caller.
  ** Otherwise the list would be left pointing at freed memory.
  */
  if( rc==SQLITE_OK ){
    pRet->lock.pNext = pSummary->pLock;
    pSummary->pLock = &pRet->lock;
  }

 out:
  sqlite3_mutex_leave(mutex);
  sqlite3_free(zWal);
  if( rc!=SQLITE_OK ){
    if( pRet ){
      sqlite3OsClose(pRet->pFd);
      sqlite3_free(pRet);
      pRet = 0;
    }
    if( pSummary ){
      /* Drop the reference taken above. pSummary is non-NULL here only
      ** if nRef was incremented. The object may be shared with other
      ** connections, so it is unlinked and freed only when this was the
      ** last reference.
      **
      ** NOTE(review): if logSummaryInit() failed part-way through, the
      ** summary may still own an open file descriptor and/or mapping.
      ** Confirm whether logSummaryUnmap() is safe to call on a partly
      ** initialized object and, if so, call it before freeing.
      */
      sqlite3_mutex *listMutex = 0;
      if( sqlite3GlobalConfig.bCoreMutex ){
        listMutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
      }
      sqlite3_mutex_enter(listMutex);
      pSummary->nRef--;
      if( pSummary->nRef==0 ){
        LogSummary **pp;
        for(pp=&pLogSummary; *pp!=pSummary; pp=&(*pp)->pNext);
        *pp = pSummary->pNext;
        sqlite3_mutex_leave(listMutex);
        sqlite3_mutex_free(pSummary->mutex);
        sqlite3_free(pSummary);
      }else{
        sqlite3_mutex_leave(listMutex);
      }
    }
  }
  *ppLog = pRet;
  return rc;
}

/*
** Advance the iterator to the next database page. On entry *piPage holds
** the page number returned by the previous call (or 0 before the first
** call). The function looks for the smallest page number greater than
** *piPage that appears in any segment.
**
** If such a page is found, *piPage is set to its page number, *piFrame to
** the log frame to read it from, and 0 is returned. If there are no more
** pages, *piPage is set to 0xFFFFFFFF and non-zero is returned.
**
** Segments are visited from last to first and a candidate replaces the
** current best only if its page number is strictly smaller, so when the
** same page appears in several segments the frame from the highest
** numbered segment is the one reported.
*/
static int logIteratorNext(
  LogIterator *p,               /* Iterator */
  u32 *piPage,                    /* OUT: Next db page to write */
  u32 *piFrame                    /* OUT: Log frame to read from */
){
  const u32 iFloor = *piPage;     /* Only pages greater than this qualify */
  u32 iBest = 0xFFFFFFFF;         /* Smallest qualifying page seen so far */
  int nEntry = p->nFinal;         /* Entries in the segment being visited */
  int iSeg;                       /* Index of segment being visited */

  for(iSeg=p->nSegment-1; iSeg>=0; iSeg--){
    struct LogSegment *pSeg = &p->aSegment[iSeg];

    /* Step past entries at or below iFloor. The cursor is left on the
    ** first entry above iFloor so that it is considered again next call.
    */
    for(; pSeg->iNext<nEntry; pSeg->iNext++){
      int iSlot = pSeg->aIndex[pSeg->iNext];
      u32 iCand = pSeg->aDbPage[iSlot];
      if( iCand<=iFloor ) continue;
      if( iCand<iBest ){
        iBest = iCand;
        *piFrame = iSeg*256 + 1 + iSlot;
      }
      break;
    }

    /* Every segment other than the final one holds 256 entries. */
    nEntry = 256;
  }

  *piPage = iBest;
  return (iBest==0xFFFFFFFF);
}

/*
** Allocate and initialize an iterator over the frames currently in the
** log, as described by pLog->hdr.iLastPg. The iterator is later released
** using logIteratorFree().
**
** The allocation holds the LogIterator itself, one LogSegment per block
** of 256 frames, and 512 extra bytes. The first 256 extra bytes hold the
** index for the final segment, which is built here. The second 256 bytes
** are scratch space handed to logMergesort8(). The other segments point
** directly at the page-number and index arrays stored in the log-summary.
**
** Return NULL if the allocation fails.
*/
static LogIterator *logIteratorInit(Log *pLog){
  u32 *aData = pLog->pSummary->aData;
  LogIterator *p;                 /* Return value */
  int nSegment;                   /* Number of segments to merge */
  u32 iLast;                      /* Last frame in log */
  int nByte;                      /* Number of bytes to allocate */
  int i;                          /* Iterator variable */
  int nFinal;                     /* Number of unindexed entries */
  struct LogSegment *pFinal;      /* Final (unindexed) segment */
  u8 *aTmp;                       /* Temp space used by merge-sort */

  iLast = pLog->hdr.iLastPg;
  nSegment = (iLast >> 8) + 1;
  nFinal = (iLast & 0x000000FF);

  nByte = sizeof(LogIterator) + (nSegment-1)*sizeof(struct LogSegment) + 512;
  p = (LogIterator *)sqlite3_malloc(nByte);
  if( !p ){
    /* Out of memory. The caller treats a NULL return as SQLITE_NOMEM. */
    return 0;
  }
  memset(p, 0, nByte);
  p->nSegment = nSegment;
  p->nFinal = nFinal;

  /* All segments except the last use the arrays in the log-summary. */
  for(i=0; i<nSegment-1; i++){
    p->aSegment[i].aDbPage = &aData[logSummaryEntry(i*256+1)];
    p->aSegment[i].aIndex = (u8 *)&aData[logSummaryEntry(i*256+1)+256];
  }
  pFinal = &p->aSegment[nSegment-1];

  /* Build the index for the final segment in the space that follows the
  ** segment array. Start with the identity permutation and sort it.
  ** logMergesort8() takes nFinal by pointer, so the possibly updated
  ** count is stored back into the iterator afterwards.
  */
  pFinal->aDbPage = &aData[logSummaryEntry((nSegment-1)*256+1)];
  pFinal->aIndex = (u8 *)&pFinal[1];
  aTmp = &pFinal->aIndex[256];
  for(i=0; i<nFinal; i++){
    pFinal->aIndex[i] = i;
  }
  logMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
  p->nFinal = nFinal;

  return p;
}

/*
** Free a log iterator allocated by logIteratorInit(). The iterator is
** released by passing p to sqlite3_free().
*/
static void logIteratorFree(LogIterator *p){
  sqlite3_free(p);
}

/*
** Checkpoint the contents of the log file.
**
** Every page found by the log iterator is read from the log into zBuf
** and written to its place in the database file pFd. The database file
** is then truncated to pLog->hdr.nPage pages. If sync_flags is non-zero
** the log is synced before copying and the database file afterwards.
** Finally the header is reset to describe an empty log and written to
** the log-summary.
**
** Return SQLITE_OK if successful (including when the log is empty),
** SQLITE_NOMEM if the iterator cannot be allocated, or the error code
** returned by the failing file operation.
*/
static int logCheckpoint(
  Log *pLog,                      /* Log connection */
  sqlite3_file *pFd,              /* File descriptor open on db file */
  int sync_flags,                 /* Flags for OsSync() (or 0) */
  u8 *zBuf                        /* Temporary buffer to use */
){
  int rc = SQLITE_OK;             /* Return code */
  int pgsz = pLog->hdr.pgsz;      /* Database page-size */
  LogIterator *pIter = 0;         /* Log iterator context */
  u32 iDbpage = 0;                /* Next database page to write */
  u32 iFrame = 0;                 /* Log frame containing data for iDbpage */

  if( pLog->hdr.iLastPg==0 ){
    return SQLITE_OK;
  }

  /* Allocate the iterator */
  pIter = logIteratorInit(pLog);
  if( !pIter ) return SQLITE_NOMEM;

  /* Sync the log file to disk */
  if( sync_flags ){
    rc = sqlite3OsSync(pLog->pFd, sync_flags);
    if( rc!=SQLITE_OK ) goto out;
  }

  /* Iterate through the contents of the log, copying data to the db file.
  ** The database offset is computed using 64-bit arithmetic so that it
  ** does not wrap for database files larger than 4GiB.
  */
  while( 0==logIteratorNext(pIter, &iDbpage, &iFrame) ){
    i64 iDbOffset = (i64)(iDbpage-1) * (i64)pgsz;
    rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz,
        logFrameOffset(iFrame, pgsz) + LOG_FRAME_HDRSIZE
    );
    if( rc!=SQLITE_OK ) goto out;
    rc = sqlite3OsWrite(pFd, zBuf, pgsz, iDbOffset);
    if( rc!=SQLITE_OK ) goto out;
  }

  /* Truncate the database file */
  rc = sqlite3OsTruncate(pFd, ((i64)pLog->hdr.nPage*(i64)pgsz));
  if( rc!=SQLITE_OK ) goto out;

  /* Sync the database file. If successful, update the log-summary. */
  if( sync_flags ){
    rc = sqlite3OsSync(pFd, sync_flags);
    if( rc!=SQLITE_OK ) goto out;
  }
  pLog->hdr.iLastPg = 0;
  pLog->hdr.iCheck1 = 2;
  pLog->hdr.iCheck2 = 3;
  logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);

  /* TODO: If a crash occurs and the current log is copied into the
  ** database there is no problem. However, if a crash occurs while
  ** writing the next transaction into the start of the log, such that:
  **
  **   * The first transaction currently in the log is left intact, but
  **   * The second (or subsequent) transaction is damaged,
  **
  ** then the database could become corrupt.
  **
  ** The easiest thing to do would be to write and sync a dummy header
  ** into the log at this point. Unfortunately, that turns out to be
  ** an unwelcome performance hit. Alternatives are...
  */
#if 0
  memset(zBuf, 0, LOG_FRAME_HDRSIZE);
  rc = sqlite3OsWrite(pLog->pFd, zBuf, LOG_FRAME_HDRSIZE, 0);
  if( rc!=SQLITE_OK ) goto out;
  rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
#endif

 out:
  logIteratorFree(pIter);
  return rc;
}

/*
** Close a connection to a log file.
**
** The connection is removed from the list of locks attached to the
** shared LogSummary object and the reference count on that object is
** decremented. If this was the last reference held by this process, an
** EXCLUSIVE lock on the database file pFd is attempted. If it is
** obtained, the log is checkpointed before the summary is unmapped.
**
** Passing a NULL pLog is a no-op that returns SQLITE_OK. Otherwise the
** return value is SQLITE_OK, or the error code from the final checkpoint,
** the file deletion, or the attempt to lock the database file. Failing
** to obtain the EXCLUSIVE lock with SQLITE_BUSY is not an error: it
** means some other connection is still using the database. The Log
** handle is always freed, whatever the return value.
*/
int sqlite3LogClose(
  Log *pLog,                      /* Log to close */
  sqlite3_file *pFd,              /* Database file */
  int sync_flags,                 /* Flags to pass to OsSync() (or 0) */
  u8 *zBuf                        /* Buffer of at least page-size bytes */
){
  int rc = SQLITE_OK;
  if( pLog ){
    LogLock **ppL;
    LogSummary *pSummary = pLog->pSummary;
    sqlite3_mutex *mutex = 0;

    /* Unlink this connection's lock object from the summary's list. */
    sqlite3_mutex_enter(pSummary->mutex);
    for(ppL=&pSummary->pLock; *ppL!=&pLog->lock; ppL=&(*ppL)->pNext);
    *ppL = pLog->lock.pNext;
    sqlite3_mutex_leave(pSummary->mutex);

    if( sqlite3GlobalConfig.bCoreMutex ){
      mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
    }
    sqlite3_mutex_enter(mutex);

    /* Decrement the reference count on the log summary. If this is the last
    ** reference to the log summary object in this process, the object will
    ** be freed. If this is also the last connection to the database, then
    ** checkpoint the database and truncate the log and log-summary files
    ** to zero bytes in size.
    **/
    pSummary->nRef--;
    if( pSummary->nRef==0 ){
      LogSummary **pp;
      for(pp=&pLogSummary; *pp!=pSummary; pp=&(*pp)->pNext);
      *pp = (*pp)->pNext;

      sqlite3_mutex_leave(mutex);

      /* Note that the function-level rc is used here (no shadowing local
      ** variable), so that any failure below is reported to the caller.
      */
      rc = sqlite3OsLock(pFd, SQLITE_LOCK_EXCLUSIVE);
      if( rc==SQLITE_OK ){

        /* This is the last connection to the database (including other
        ** processes). Do three things:
        **
        **   1. Checkpoint the db.
        **   2. Truncate the log file.
        **   3. Unlink the log-summary file.
        **
        ** NOTE(review): the path passed to sqlite3OsDelete() below is
        ** pSummary->zPath. Confirm that this is the file that is meant
        ** to be removed here.
        */
        rc = logCheckpoint(pLog, pFd, sync_flags, zBuf);
        if( rc==SQLITE_OK ){
          rc = sqlite3OsDelete(pLog->pVfs, pSummary->zPath, 0);
        }

        logSummaryUnmap(pSummary, 1);
      }else{
        if( rc==SQLITE_BUSY ){
          rc = SQLITE_OK;
        }
        logSummaryUnmap(pSummary, 0);
      }
      sqlite3OsUnlock(pFd, SQLITE_LOCK_NONE);

      sqlite3_mutex_free(pSummary->mutex);
      sqlite3_free(pSummary);
    }else{
      sqlite3_mutex_leave(mutex);
    }

    /* Close the connection to the log file and free the Log handle. */
    sqlite3OsClose(pLog->pFd);
    sqlite3_free(pLog);
  }
  return rc;
}

/*
** Enter and leave the log-summary mutex. In this context, entering the
** log-summary mutex means:
**
**   1. Obtaining mutex pLog->pSummary->mutex, and
**   2. Taking an exclusive lock on the log-summary file.
**
** i.e. this mutex locks out other processes as well as other threads
** hosted in this address space.
*/
static int logEnterMutex(Log *pLog){
  LogSummary *p = pLog->pSummary; /* Summary object shared by this process */
  int rcLock;                     /* Result of locking the summary file */

  /* Take the in-process mutex first, then the lock on the log-summary
  ** file. If the file lock cannot be obtained the in-process mutex is
  ** released again, so on failure the caller holds nothing.
  */
  sqlite3_mutex_enter(p->mutex);
  rcLock = logLockMutex(p, LOG_WRLOCKW);
  if( rcLock==SQLITE_OK ){
    return SQLITE_OK;
  }
  sqlite3_mutex_leave(p->mutex);
  return rcLock;
}
static void logLeaveMutex(Log *pLog){
  /* Release in the reverse order of logEnterMutex(): drop the lock on the
  ** log-summary file first, then leave the in-process mutex.
  */
  logLockMutex(pLog->pSummary, LOG_UNLOCK);
  sqlite3_mutex_leave(pLog->pSummary->mutex);
}

/*
** Make one attempt to load the log-summary header into pLog->hdr.
**
** The header fields and the two checksum words that follow them are
** copied out of the mapped log-summary into a local buffer. A checksum
** is then computed over the header fields and compared with the stored
** pair. If they do not match, SQLITE_ERROR is returned and pLog->hdr is
** left untouched.
**
** If the checksum matches and the header differs from the current
** contents of pLog->hdr, pLog->hdr is overwritten and, if pChanged is
** not NULL, *pChanged is set to 1. If nothing changed, *pChanged is not
** written. SQLITE_OK is returned in both cases.
*/
int logSummaryTryHdr(Log *pLog, int *pChanged){
  u32 aSnap[LOGSUMMARY_HDR_NFIELD+2];   /* Header fields + 2 checksum words */
  u32 aSum[2] = {1, 1};                 /* Checksum computed locally */
  const u32 *aStored = &aSnap[LOGSUMMARY_HDR_NFIELD];

  /* Work on a private copy so that the bytes that are verified are the
  ** same bytes that are later copied into pLog->hdr.
  */
  memcpy(aSnap, pLog->pSummary->aData, sizeof(aSnap));
  logChecksumBytes((u8*)aSnap, sizeof(u32)*LOGSUMMARY_HDR_NFIELD, aSum);
  if( aSum[0]!=aStored[0] || aSum[1]!=aStored[1] ){
    return SQLITE_ERROR;
  }

  if( 0==memcmp(&pLog->hdr, aSnap, sizeof(LogSummaryHdr)) ){
    return SQLITE_OK;
  }
  if( pChanged ){
    *pChanged = 1;
  }
  memcpy(&pLog->hdr, aSnap, sizeof(LogSummaryHdr));
  return SQLITE_OK;
}

/*
** Load the log-summary header into pLog->hdr, running log recovery if
** the header cannot be verified.
**
** The header is first read without taking any lock. If its checksum does
** not verify, the log-summary mutex is entered and the read is retried.
** If that also fails, logSummaryRecover() is run and the header is read
** one final time. In the recovery case *pChanged (if pChanged is not
** NULL) is set to 1.
**
** Return SQLITE_OK if pLog->hdr now holds a verified header, or an SQLite
** error code otherwise.
*/
int logSummaryReadHdr(Log *pLog, int *pChanged){
  int rc;

  /* Fast path: an unlocked read. */
  if( logSummaryTryHdr(pLog, pChanged)==SQLITE_OK ){
    return SQLITE_OK;
  }

  /* Slow path: serialize with other users of the log-summary. */
  rc = logEnterMutex(pLog);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  if( logSummaryTryHdr(pLog, pChanged)!=SQLITE_OK ){
    /* The header is still unusable with the mutex held. Rebuild the
    ** summary from the log file and then read the fresh header.
    */
    if( pChanged ){
      *pChanged = 1;
    }
    rc = logSummaryRecover(pLog->pSummary, pLog->pFd);
    if( rc==SQLITE_OK ){
      rc = logSummaryTryHdr(pLog, 0);
    }
  }

  logLeaveMutex(pLog);
  return rc;
}

/*
** Lock a snapshot.
**
** If this call obtains a new read-lock and the database contents have been
** modified since the most recent call to LogCloseSnapshot() on this Log
** connection, then *pChanged is set to 1 before returning. Otherwise, it
** is left unmodified. This is used by the pager layer to determine whether
** or not any cached pages may be safely reused.
**
** If the connection already holds a snapshot lock this is a no-op that
** returns SQLITE_OK. Otherwise SQLITE_OK is returned if a lock was
** obtained and the log-summary header read successfully. If an error
** code is returned, no snapshot lock is held and pLog->isLocked is zero.
*/
int sqlite3LogOpenSnapshot(Log *pLog, int *pChanged){
  int rc = SQLITE_OK;
  if( pLog->isLocked==0 ){
    int nAttempt;

    /* Obtain a snapshot-lock on the log-summary file. The procedure
    ** for obtaining the snapshot log is:
    **
    **    1. Attempt a SHARED lock on regions A and B.
    **    2a. If step 1 is successful, drop the lock on region B.
    **    2b. If step 1 is unsuccessful, attempt a SHARED lock on region D.
    **    3. Repeat the above until the lock attempt in step 1 or 2b is
    **       successful.
    **
    ** If neither of the locks can be obtained after 5 tries, presumably
    ** something is wrong (i.e. a process not following the locking protocol).
    ** Return an error code in this case.
    */
    rc = SQLITE_BUSY;
    for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){
      rc = logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B, LOG_RDLOCK);
      if( rc==SQLITE_BUSY ){
        rc = logLockRegion(pLog, LOG_REGION_D, LOG_RDLOCK);
        if( rc==SQLITE_OK ) pLog->isLocked = LOG_REGION_D;
      }else if( rc==SQLITE_OK ){
        /* Only record the region-A lock if it was really obtained. For
        ** any other result (e.g. an IO error) no lock is held, so
        ** isLocked must stay zero and region B must not be unlocked.
        */
        logLockRegion(pLog, LOG_REGION_B, LOG_UNLOCK);
        pLog->isLocked = LOG_REGION_A;
      }
    }
    if( rc!=SQLITE_OK ){
      return rc;
    }

    rc = logSummaryReadHdr(pLog, pChanged);
    if( rc!=SQLITE_OK ){
      /* An error occurred while attempting log recovery. */
      sqlite3LogCloseSnapshot(pLog);
    }
  }
  return rc;
}

/*
** Release the snapshot lock held by this connection, if any. After this
** call pLog->isLocked is always zero.
*/
void sqlite3LogCloseSnapshot(Log *pLog){
  int eHeld = pLog->isLocked;     /* Region currently read-locked, or 0 */

  if( eHeld!=0 ){
    /* A reader only ever holds its lock on region A or on region D. */
    assert( pLog->isLocked==LOG_REGION_A || pLog->isLocked==LOG_REGION_D );
    logLockRegion(pLog, eHeld, LOG_UNLOCK);
  }
  pLog->isLocked = 0;
}

/*
** Read a page from the log, if it is present.
**
** The log-summary is searched for page pgno, newest frames first. If a
** frame is found, *pInLog is set to 1, the page data is read from the log
** file into pOut and the result of that read is returned. Otherwise
** *pInLog is set to 0 and SQLITE_OK is returned without touching pOut.
**
** The caller must have a snapshot open.
*/
int sqlite3LogRead(Log *pLog, Pgno pgno, int *pInLog, u8 *pOut){
  u32 *aSummary = pLog->pSummary->aData;  /* Mapped log-summary */
  u32 iLast = pLog->hdr.iLastPg;          /* Last frame in the snapshot */
  int iBase = (iLast & 0xFFFFFF00);       /* Frames before unindexed tail */
  u32 iHit = 0;                           /* Frame holding pgno, or 0 */

  assert( pLog->isLocked );

  /* The entries after the last complete block of 256 have no index.
  ** Scan them linearly, from the most recent frame backwards. An
  ** alternative would be to build an index in private memory each time a
  ** read transaction is opened on a new snapshot.
  */
  if( iLast ){
    u32 *pCur = &aSummary[logSummaryEntry(iLast)];
    u32 *pStop = pCur - (iLast & 0xFF);
    for(; *pCur!=pgno && pCur!=pStop; pCur--){}
    if( pCur!=pStop ){
      iHit = (pCur-pStop) + iBase;
    }
  }
  assert( iHit==0 || aSummary[logSummaryEntry(iHit)]==pgno );

  /* Not in the tail. Walk the indexed blocks from newest to oldest and
  ** binary-search each one using its index of sorted slots.
  */
  while( iHit==0 && iBase>0 ){
    u32 *aPgno;                   /* Page numbers of the 256 frames */
    u8 *aSorted;                  /* Slot numbers ordered by page number */
    int lo = 0;
    int hi = 255;

    iBase -= 256;
    aPgno = &aSummary[logSummaryEntry(iBase+1)];
    aSorted = (u8 *)&aPgno[256];

    while( lo<=hi ){
      int mid = (lo+hi)>>1;
      u32 iCand = aPgno[aSorted[mid]];

      if( iCand<pgno ){
        lo = mid+1;
      }else if( iCand>pgno ){
        hi = mid-1;
      }else{
        iHit = iBase + 1 + aSorted[mid];
        break;
      }
    }
  }
  assert( iHit==0 || aSummary[logSummaryEntry(iHit)]==pgno );

  /* A zero iHit means the page is not in the log. Otherwise iHit is the
  ** number of the frame to read the page from. The page data follows
  ** the frame header.
  */
  if( iHit==0 ){
    *pInLog = 0;
    return SQLITE_OK;
  }else{
    i64 iOffset = logFrameOffset(iHit, pLog->hdr.pgsz) + LOG_FRAME_HDRSIZE;
    *pInLog = 1;
    return sqlite3OsRead(pLog->pFd, pOut, pLog->hdr.pgsz, iOffset);
  }
}


/*
** Set *pPgno to the size of the database file (or zero, if unknown).
**
** The value is copied from the nPage field of this connection's private
** copy of the log-summary header (pLog->hdr), which is measured in pages.
** The caller must have a snapshot open (pLog->isLocked is non-zero).
*/
void sqlite3LogMaxpgno(Log *pLog, Pgno *pPgno){
  assert( pLog->isLocked );
  *pPgno = pLog->hdr.nPage;
}

/*
** Obtain (op!=0) or release (op==0) the write-lock on the log.
**
** When obtaining the lock, SQLITE_OK is returned if the caller may write
** to the database. If the caller is operating on a snapshot that has
** already been overwritten by another writer, SQLITE_BUSY is returned
** and no write-lock is held. An error from the lock attempt itself is
** returned unchanged.
**
** Releasing a write-lock that is not held is a no-op. Releasing always
** returns SQLITE_OK.
*/
int sqlite3LogWriteLock(Log *pLog, int op){
  assert( pLog->isLocked );

  if( op==0 ){
    /* Release. After dropping the locks, reload the private header from
    ** the log-summary so it matches what is currently stored there.
    */
    if( pLog->isWriteLocked ){
      logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
      memcpy(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr));
      pLog->isWriteLocked = 0;
    }
    return SQLITE_OK;
  }else{
    /* Obtain the writer lock */
    int rc = logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_WRLOCK);
    if( rc!=SQLITE_OK ){
      return rc;
    }

    /* If this connection is a region D reader, then the SHARED lock on
    ** region D has just been upgraded to EXCLUSIVE. But no lock at all is
    ** held on region A. This means that if the write-transaction is
    ** committed and this connection downgrades to a reader, it will be
    ** left with no lock at all. And so its snapshot could get clobbered
    ** by a checkpoint operation.
    **
    ** To stop this from happening, grab a SHARED lock on region A now.
    ** This should always be successful, as the only time a client holds
    ** an EXCLUSIVE lock on region A, it must also be holding an EXCLUSIVE
    ** lock on region C (a checkpointer does this). This is not possible,
    ** as this connection currently has the EXCLUSIVE lock on region C.
    */
    if( pLog->isLocked==LOG_REGION_D ){
      logLockRegion(pLog, LOG_REGION_A, LOG_RDLOCK);
      pLog->isLocked = LOG_REGION_A;
    }

    /* A connection that is not reading the most recent snapshot cannot
    ** write to the database. Give the write locks back and report BUSY.
    */
    if( 0!=memcmp(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr)) ){
      logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
      return SQLITE_BUSY;
    }

    pLog->isWriteLocked = 1;
    return SQLITE_OK;
  }
}

/*
** Write a set of frames to the log. The caller must hold at least a
** RESERVED lock on the database file.
**
** One frame is appended to the log file for each page in pList. If
** sync_flags is non-zero, extra copies of the last frame are appended
** until the end of the log reaches a sector boundary, and the log file
** is then synced. The new frames are added to the log-summary and the
** private header pLog->hdr is updated. If isCommit is true the header is
** also written to the log-summary, under the log-summary mutex.
**
** Return SQLITE_OK if successful. If a write or sync of the log file
** fails, or if the log-summary mutex cannot be obtained to publish the
** header of a commit, the corresponding error code is returned.
*/
int sqlite3LogFrames(
  Log *pLog,                      /* Log handle to write to */
  int nPgsz,                      /* Database page-size in bytes */
  PgHdr *pList,                   /* List of dirty pages to write */
  Pgno nTruncate,                 /* Database size after this commit */
  int isCommit,                   /* True if this is a commit */
  int sync_flags                  /* Flags to pass to OsSync() (or 0) */
){
  int rc;                         /* Used to catch return codes */
  u32 iFrame;                     /* Next frame address */
  u8 aFrame[LOG_FRAME_HDRSIZE];   /* Buffer to assemble frame-header in */
  PgHdr *p;                       /* Iterator to run through pList with. */
  u32 aCksum[2];                  /* Checksums */
  PgHdr *pLast = 0;               /* Last frame in list */
  int nLast = 0;                  /* Number of extra copies of last page */

  assert( LOG_FRAME_HDRSIZE==(4 * 2 + LOG_CKSM_BYTES) );
  assert( pList );

  /* If this is the first frame written into the log, write the log
  ** header to the start of the log file. See comments at the top of
  ** this file for a description of the log-header format. The two salt
  ** values also seed the running checksum.
  */
  assert( LOG_FRAME_HDRSIZE>=LOG_HDRSIZE );
  iFrame = pLog->hdr.iLastPg;
  if( iFrame==0 ){
    sqlite3Put4byte(aFrame, nPgsz);
    sqlite3_randomness(8, &aFrame[4]);
    pLog->hdr.iCheck1 = sqlite3Get4byte(&aFrame[4]);
    pLog->hdr.iCheck2 = sqlite3Get4byte(&aFrame[8]);
    rc = sqlite3OsWrite(pLog->pFd, aFrame, LOG_HDRSIZE, 0);
    if( rc!=SQLITE_OK ){
      return rc;
    }
  }

  aCksum[0] = pLog->hdr.iCheck1;
  aCksum[1] = pLog->hdr.iCheck2;

  /* Write the log file. */
  for(p=pList; p; p=p->pDirty){
    u32 nDbsize;                  /* Db-size field for frame header */
    i64 iOffset;                  /* Write offset in log file */

    iOffset = logFrameOffset(++iFrame, nPgsz);

    /* Populate and write the frame header. Only the final frame of a
    ** commit carries the database size; all others store zero. */
    nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
    logEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
    rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
    if( rc!=SQLITE_OK ){
      return rc;
    }

    /* Write the page data */
    rc = sqlite3OsWrite(pLog->pFd, p->pData, nPgsz, iOffset + sizeof(aFrame));
    if( rc!=SQLITE_OK ){
      return rc;
    }
    pLast = p;
  }

  /* Sync the log file if sync_flags is non-zero. */
  if( sync_flags ){
    i64 iSegment = sqlite3OsSectorSize(pLog->pFd);
    i64 iOffset = logFrameOffset(iFrame+1, nPgsz);

    assert( isCommit );

    /* Pad the log out to the next sector boundary by appending extra
    ** copies of the last frame, each marked as a commit frame. */
    if( iSegment<SQLITE_DEFAULT_SECTOR_SIZE ){
      iSegment = SQLITE_DEFAULT_SECTOR_SIZE;
    }
    iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
    while( iOffset<iSegment ){
      logEncodeFrame(aCksum,pLast->pgno,nTruncate,nPgsz,pLast->pData,aFrame);
      rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
      if( rc!=SQLITE_OK ){
        return rc;
      }

      iOffset += LOG_FRAME_HDRSIZE;
      rc = sqlite3OsWrite(pLog->pFd, pLast->pData, nPgsz, iOffset);
      if( rc!=SQLITE_OK ){
        return rc;
      }
      nLast++;
      iOffset += nPgsz;
    }

    rc = sqlite3OsSync(pLog->pFd, sync_flags);
    if( rc!=SQLITE_OK ){
      return rc;
    }
  }

  /* Append data to the log summary. It is not necessary to lock the
  ** log-summary to do this as the RESERVED lock held on the db file
  ** guarantees that there are no other writers, and no data that may
  ** be in use by existing readers is being overwritten.
  */
  iFrame = pLog->hdr.iLastPg;
  for(p=pList; p; p=p->pDirty){
    iFrame++;
    logSummaryAppend(pLog->pSummary, iFrame, p->pgno);
  }
  while( nLast>0 ){
    iFrame++;
    nLast--;
    logSummaryAppend(pLog->pSummary, iFrame, pLast->pgno);
  }

  /* Update the private copy of the header. */
  pLog->hdr.pgsz = nPgsz;
  pLog->hdr.iLastPg = iFrame;
  if( isCommit ){
    pLog->hdr.iChange++;
    pLog->hdr.nPage = nTruncate;
  }
  pLog->hdr.iCheck1 = aCksum[0];
  pLog->hdr.iCheck2 = aCksum[1];

  /* If this is a commit, update the log-summary header too. If the
  ** log-summary mutex cannot be obtained the header has not been
  ** published, so the error must be reported rather than returning
  ** SQLITE_OK.
  */
  rc = SQLITE_OK;
  if( isCommit ){
    rc = logEnterMutex(pLog);
    if( rc==SQLITE_OK ){
      logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
      logLeaveMutex(pLog);
    }
  }

  return rc;
}

/*
** Checkpoint the database:
**
**   1. Wait for an EXCLUSIVE lock on regions B and C.
**   2. Wait for an EXCLUSIVE lock on region A.
**   3. Copy the contents of the log into the database file.
**   4. Zero the log-summary header (so new readers will ignore the log).
**   5. Drop the locks obtained in steps 1 and 2.
**
** While a lock attempt returns SQLITE_BUSY, xBusyHandler is invoked with
** pBusyHandlerArg and the attempt is repeated for as long as the handler
** returns non-zero. xBusyHandler may be NULL, in which case a busy lock
** is not retried and SQLITE_BUSY is returned.
**
** The connection must not have a snapshot open when this is called.
** Return SQLITE_OK if successful, or an SQLite error code otherwise.
*/
int sqlite3LogCheckpoint(
  Log *pLog,                      /* Log connection */
  sqlite3_file *pFd,              /* File descriptor open on db file */
  int sync_flags,                 /* Flags to sync db file with (or 0) */
  u8 *zBuf,                       /* Temporary buffer to use */
  int (*xBusyHandler)(void *),    /* Pointer to busy-handler function */
  void *pBusyHandlerArg           /* Argument to pass to xBusyHandler */
){
  int rc;                         /* Return code */

  assert( !pLog->isLocked );

  /* Wait for an EXCLUSIVE lock on regions B and C. */
  do {
    rc = logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_WRLOCK);
  }while( rc==SQLITE_BUSY && xBusyHandler && xBusyHandler(pBusyHandlerArg) );
  if( rc!=SQLITE_OK ) return rc;

  /* Wait for an EXCLUSIVE lock on region A. */
  do {
    rc = logLockRegion(pLog, LOG_REGION_A, LOG_WRLOCK);
  }while( rc==SQLITE_BUSY && xBusyHandler && xBusyHandler(pBusyHandlerArg) );
  if( rc!=SQLITE_OK ){
    logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
    return rc;
  }

  /* Copy data from the log to the database file. The header is read
  ** first so that the checkpoint works from the current log-summary
  ** contents. */
  rc = logSummaryReadHdr(pLog, 0);
  if( rc==SQLITE_OK ){
    rc = logCheckpoint(pLog, pFd, sync_flags, zBuf);
  }

  /* Release the locks. */
  logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
  return rc;
}