Documentation of the new pager locking mechanism. (CVS 1570)

FossilOrigin-Name: 13cf1ba8256bf8cee0195dbaeac71a20cd2c2fc2
2004-06-11 17:48:02 +00:00 · 2004-06-11 17:48:02 +00:00 · 52619dfb7f
commit 52619dfb7f
parent ce2663ccc8
7 changed files with 664 additions and 40 deletions
--- a/main.mk
+++ b/main.mk
@ -419,6 +419,9 @@ index.html:	$(TOP)/www/index.tcl last_change
 lang.html:	$(TOP)/www/lang.tcl
 	tclsh $(TOP)/www/lang.tcl >lang.html

+lockingv3.html:	$(TOP)/www/lockingv3.tcl
+	tclsh $(TOP)/www/lockingv3.tcl >lockingv3.html
+
 omitted.html:	$(TOP)/www/omitted.tcl
 	tclsh $(TOP)/www/omitted.tcl >omitted.html

@ -475,6 +478,7 @@ DOC = \
  formatchng.html \
  index.html \
  lang.html \
+  lockingv3.html \
  mingw.html \
  nulls.html \
  omitted.html \
--- a/21
+++ b/21
@ -1,5 +1,5 @@
-C Have\sthe\svdbe\saggregator\suse\sa\sbtree\stable\sinstead\sof\sa\shash\stable.\s(CVS\s1569)
-D 2004-06-11T13:19:21
+C Documentation\sof\sthe\snew\spager\slocking\smechanism.\s(CVS\s1570)
+D 2004-06-11T17:48:03
 F Makefile.in ab7b0d5118e2da97bac66be8684a1034e3500f5a
 F Makefile.linux-gcc a9e5a0d309fa7c38e7c14d3ecf7690879d3a5457
 F README f1de682fbbd94899d50aca13d387d1b3fd3be2dd
@ -17,7 +17,7 @@ F doc/lemon.html f0f682f50210928c07e562621c3b7e8ab912a538
 F doc/report1.txt a031aaf37b185e4fa540223cb516d3bccec7eeac
 F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895
 F ltmain.sh f6b283068efa69f06eb8aa1fe4bddfdbdeb35826
-F main.mk 126f1ca2190fa19cc9944f54943cf3431891c330
+F main.mk e02f35c8354fe9a0ab48fc87b0bcb3fb7a86d713
 F mkdll.sh 68d34a961a1fdfa15ef27fc4f4740be583112124
 F publish.sh 2e579b7474d57b6debcead93c73a49eb8cb81718
 F spec.template a38492f1c1dd349fc24cb0565e08afc53045304b
@ -56,7 +56,7 @@ F src/printf.c 63b15f1ea9fe3daa066bb7430fd20d4a2d717dc8
 F src/random.c eff68e3f257e05e81eae6c4d50a51eb88beb4ff3
 F src/select.c d29488f86e61e0d45dff318e1f04ba6a7e5782d0
 F src/shell.c ca519519dcbbc582f6d88f7d0e7583b857fd3469
-F src/sqlite.h.in 2b6afe1de6935d3dfbd6042f46a62f1b7c3b3992
+F src/sqlite.h.in 56c53344a6fbd76328d641d9ddab90d4e4ba9129
 F src/sqliteInt.h 625faf4c9ce2f99b9c85a2bca5c4e73736c30262
 F src/table.c af14284fa36c8d41f6829e3f2819dce07d3e2de2
 F src/tclsqlite.c e974c0b2479ed37334aeb268de331e0a1b21b5a8
@ -194,7 +194,7 @@ F www/arch2.gif 49c6bb36160f03ca2b89eaa5bfb1f560c7d68ee7
 F www/audit.tcl 90e09d580f79c7efec0c7d6f447b7ec5c2dce5c0
 F www/c_interface.tcl 2176519fc2bd2d2cf6fe74fd806fc2d8362de2c8
 F www/capi3.tcl a940c5ca0b0ebafd5acfe2ceca5a388bd8cfb468
-F www/capi3ref.tcl 4248a45a0fd21fe0c116326300408d0999a028b5
+F www/capi3ref.tcl 2cb0861c5219c6e44298c217a6c904464b4403bc
 F www/changes.tcl cbe942607b2b9e5dc995395f37042dbb5a629c7f
 F www/common.tcl f786e6be86fb2627ceb30e770e9efa83b9c67a3a
 F www/conflict.tcl fb8a2ba83746c7fdfd9e52fa7f6aaf5c422b8246
@ -203,7 +203,7 @@ F www/copyright-release.pdf cfca3558fc97095e57c6117d08f1f5b80d95125a
 F www/copyright.tcl 82c9670c7ddb0311912ab7fe24703f33c531066c
 F www/datatype3.tcl f48b05cafd5e54ae5c05e643169d5217ee51a244
 F www/datatypes.tcl 566004b81c36877397ddbe6e1907aae6065f6b41
-F www/docs.tcl 0dcbf954907bd5dbfb7f1e0220f4e50516e07cd3
+F www/docs.tcl a924a5043973a70c58b65abf1051ed8b1238864d
 F www/download.tcl 8c84f15695c92cb01486930055fdf5192995f474
 F www/dynload.tcl 02eb8273aa78cfa9070dd4501dca937fb22b466c
 F www/faq.tcl 3a1776818d9bd973ab0c3048ec7ad6b1ad091ae5
@ -211,6 +211,7 @@ F www/fileformat.tcl f71a06a0d533c7df408539c64113b4adeaf29764
 F www/formatchng.tcl 7ed8a5c871ab105f01e5defe1822ec39f70675bb
 F www/index.tcl 64435502af780d7cd813365d443b9b9344662ce4
 F www/lang.tcl fc528581c4a406cabc138e2f17db5ef6f38615ff
+F www/lockingv3.tcl 1449cf3ff0249d6ed44e06b05244b423693d6265
 F www/mingw.tcl d96b451568c5d28545fefe0c80bee3431c73f69c
 F www/nulls.tcl f31330db8c978e675f5cd263067b32b822effa6f
 F www/omitted.tcl 7bd62b6f0f53b60c5360895b16b3af8407bbca03
@ -222,7 +223,7 @@ F www/support.tcl 1801397edd271cc39a2aadd54e701184b5181248
 F www/tclsqlite.tcl 19191cf2a1010eaeff74c51d83fd5f5a4d899075
 F www/vdbe.tcl 59288db1ac5c0616296b26dce071c36cb611dfe9
 F www/whentouse.tcl a8335bce47cc2fddb07f19052cb0cb4d9129a8e4
-P 66835ee67051027456a536e33b2f88a741654525
-R 17851c4c7a9d880622be1c17f4d88abf
-U danielk1977
-Z 37fd25919977ced5eda4bf17b983a171
+P 8d56118f64dbaf8c8006266fa7026f900a4a16bd
+R 452a2adcbc4985a31c30bc500c6b52fd
+U drh
+Z 469b4dfcfa6222fd78598800788c6259
--- a/manifest.uuid
+++ b/manifest.uuid
@ -1 +1 @@
-8d56118f64dbaf8c8006266fa7026f900a4a16bd
+13cf1ba8256bf8cee0195dbaeac71a20cd2c2fc2
--- a/src/sqlite.h.in
+++ b/src/sqlite.h.in
@ -12,7 +12,7 @@
 ** This header file defines the interface that the SQLite library
 ** presents to client programs.
 **
-** @(#) $Id: sqlite.h.in,v 1.97 2004/06/10 10:50:30 danielk1977 Exp $
+** @(#) $Id: sqlite.h.in,v 1.98 2004/06/11 17:48:03 drh Exp $
 */
 #ifndef _SQLITE_H_
 #define _SQLITE_H_
@ -963,9 +963,15 @@ void sqlite3_result_text(sqlite3_context*, const char*, int n, int eCopy);
 void sqlite3_result_text16(sqlite3_context*, const void*, int n, int eCopy);
 void sqlite3_result_value(sqlite3_context*, sqlite3_value*);

+/*
+** These are the allowed values for the eTextRep argument to
+** sqlite3_create_collation and sqlite3_create_function.
+*/
 #define SQLITE_UTF8    1
-#define SQLITE_UTF16LE 2
-#define SQLITE_UTF16BE 3
+#define SQLITE_UTF16   2    /* Use native byte order */
+#define SQLITE_UTF16LE 3
+#define SQLITE_UTF16BE 4
+#define SQLITE_ANY     5    /* sqlite3_create_function only */

 /*
 ** These two functions are used to add new collation sequences to the
--- a/www/capi3ref.tcl
+++ b/www/capi3ref.tcl
@ -1,4 +1,4 @@
-set rcsid {$Id: capi3ref.tcl,v 1.1 2004/06/01 01:22:38 drh Exp $}
+set rcsid {$Id: capi3ref.tcl,v 1.2 2004/06/11 17:48:04 drh Exp $}
 source common.tcl
 header {C/C++ Interface For SQLite Version 3}
 puts {
@ -69,13 +69,15 @@ api {} {
 }

 api {} {
-  int sqlite3_bind_blob(sqlite3_stmt*, int, const void*, int n, int eCopy);
+  int sqlite3_bind_blob(sqlite3_stmt*, int, const void*, int n, void(*)(void*));
  int sqlite3_bind_double(sqlite3_stmt*, int, double);
  int sqlite3_bind_int(sqlite3_stmt*, int, int);
  int sqlite3_bind_int64(sqlite3_stmt*, int, long long int);
  int sqlite3_bind_null(sqlite3_stmt*, int);
-  int sqlite3_bind_text(sqlite3_stmt*, int, const char*, int n, int eCopy);
-  int sqlite3_bind_text16(sqlite3_stmt*, int, const void*, int n, int eCopy);
+  int sqlite3_bind_text(sqlite3_stmt*, int, const char*, int n, void(*)(void*));
+  int sqlite3_bind_text16(sqlite3_stmt*, int, const void*, int n, void(*)(void*));
+  #define SQLITE_STATIC      ((void*)0)
+  #define SQLITE_EPHEMERAL   ((void*)8)
 } {
 In the SQL strings input to sqlite3_prepare() and sqlite3_prepare16(),
 one or more literals can be replace by a wildcard "?" or ":N:" where
@ -87,10 +89,13 @@ api {} {
 index of the wildcard.  The first "?" has an index of 1.  ":N:" wildcards
 use the index N.

- When the eCopy parameter is true, a copy of the value is made into
- memory obtained and managed by SQLite.  When eCopy is false, SQLite
- assumes that the value is a constant and just stores a pointer to the
- value without making a copy.
+ The fifth parameter to sqlite3_bind_blob(), sqlite3_bind_text(), and
+ sqlite3_bind_text16() is a destructor used to dispose of the BLOB or
+ text after SQLite has finished with it.  If the fifth argument is the
+ special value SQLITE_STATIC, then the library assumes that the information
+ is in static, unmanaged space and does not need to be freed.  If the
+ fifth argument has the value SQLITE_EPHEMERAL, then SQLite makes its
+ own private copy of the data.

 The sqlite3_bind_*() routine must be called after
 sqlite3_prepare() or sqlite3_reset() and before sqlite3_step().
@ -99,16 +104,16 @@ api {} {
 }

 api {} {
-  void sqlite3_busy_handler(sqlite*, int(*)(void*,const char*,int), void*);
+  void sqlite3_busy_handler(sqlite*, int(*)(void*,int), void*);
 } {
 This routine identifies a callback function that is invoked
 whenever an attempt is made to open a database table that is
 currently locked by another process or thread.  If the busy callback
 is NULL, then sqlite3_exec() returns SQLITE_BUSY immediately if
 it finds a locked table.  If the busy callback is not NULL, then
- sqlite3_exec() invokes the callback with three arguments.  The
- second argument is the name of the locked table and the third
- argument is the number of times the table has been busy.  If the
+ sqlite3_exec() invokes the callback with two arguments.  The
+ second argument is the number of prior calls to the busy callback
+ for the same lock.  If the
 busy callback returns 0, then sqlite3_exec() immediately returns
 SQLITE_BUSY.  If the callback returns non-zero, then sqlite3_exec()
 tries to open the table again and the cycle repeats.
@ -309,6 +314,90 @@ int sqlite3_complete16(const void *sql);
 false.
 } {}

+api {} {
+int sqlite3_create_collation(
+  sqlite3*, 
+  const char *zName, 
+  int pref16, 
+  void*,
+  int(*xCompare)(void*,int,const void*,int,const void*)
+);
+int sqlite3_create_collation16(
+  sqlite3*, 
+  const char *zName, 
+  int pref16, 
+  void*,
+  int(*xCompare)(void*,int,const void*,int,const void*)
+);
+#define SQLITE_UTF8     1
+#define SQLITE_UTF16    2
+#define SQLITE_UTF16LE  3
+#define SQLITE_UTF16BE  4
+} {
+ These two functions are used to add new collation sequences to the
+ sqlite3 handle specified as the first argument. 
+
+ The name of the new collation sequence is specified as a UTF-8 string
+ for sqlite3_create_collation() and a UTF-16 string for
+ sqlite3_create_collation16(). In both cases the name is passed as the
+ second function argument.
+
+ The third argument must be one of the constants SQLITE_UTF8,
+ SQLITE_UTF16LE or SQLITE_UTF16BE, indicating that the user-supplied
+ routine expects to be passed pointers to strings encoded using UTF-8,
+ UTF-16 little-endian or UTF-16 big-endian respectively.
+
+ A pointer to the user supplied routine must be passed as the fifth
+ argument. If it is NULL, this is the same as deleting the collation
+ sequence (so that SQLite cannot call it anymore). Each time the user
+ supplied function is invoked, it is passed a copy of the void* passed as
+ the fourth argument to sqlite3_create_collation() or
+ sqlite3_create_collation16() as its first parameter.
+
+ The remaining arguments to the user-supplied routine are two strings,
+ each represented by a [length, data] pair and encoded in the encoding
+ that was passed as the third argument when the collation sequence was
+ registered. The user routine should return negative, zero or positive if
+ the first string is less than, equal to, or greater than the second
+ string. i.e. (STRING1 - STRING2).
+}
+
+api {} {
+int sqlite3_collation_needed(
+  sqlite3*, 
+  void*, 
+  void(*)(void*,sqlite3*,int eTextRep,const char*)
+);
+int sqlite3_collation_needed16(
+  sqlite3*, 
+  void*,
+  void(*)(void*,sqlite3*,int eTextRep,const void*)
+);
+} {
+ To avoid having to register all collation sequences before a database
+ can be used, a single callback function may be registered with the
+ database handle to be called whenever an undefined collation sequence is
+ required.
+
+ If the function is registered using the sqlite3_collation_needed() API,
+ then it is passed the names of undefined collation sequences as strings
+ encoded in UTF-8. If sqlite3_collation_needed16() is used, the names
+ are passed as UTF-16 in machine native byte order. A call to either
+ function replaces any existing callback.
+
+ When the user-function is invoked, the first argument passed is a copy
+ of the second argument to sqlite3_collation_needed() or
+ sqlite3_collation_needed16(). The second argument is the database
+ handle. The third argument is one of SQLITE_UTF8, SQLITE_UTF16BE or
+ SQLITE_UTF16LE, indicating the most desirable form of the collation
+ sequence function required. The fourth parameter is the name of the
+ required collation sequence.
+
+ The collation sequence is returned to SQLite by a collation-needed
+ callback using the sqlite3_create_collation() or
+ sqlite3_create_collation16() APIs, described above.
+}
+
 api {} {
 int sqlite3_create_function(
  sqlite3 *,
@ -332,10 +421,11 @@ int sqlite3_create_function16(
  void (*xStep)(sqlite3_context*,int,sqlite3_value**),
  void (*xFinal)(sqlite3_context*)
 );
-#define SQLITE3_UTF8     1
-#define SQLITE3_UTF16LE  2
-#define SQLITE3_UTF16BE  3
-#define SQLITE3_ANY      4
+#define SQLITE_UTF8     1
+#define SQLITE_UTF16    2
+#define SQLITE_UTF16LE  3
+#define SQLITE_UTF16BE  4
+#define SQLITE_ANY      5
 } {
 These two functions are used to add user functions or aggregates
 implemented in C to the SQL langauge interpreted by SQLite. The
@ -620,13 +710,11 @@ char *sqlite3_vmprintf(const char*, va_list);
 api {} {
 int sqlite3_open(
  const char *filename,   /* Database filename (UTF-8) */
-  sqlite3 **ppDb,         /* OUT: SQLite db handle */
-  const char **args       /* Null terminated array of option strings */
+  sqlite3 **ppDb          /* OUT: SQLite db handle */
 );
 int sqlite3_open16(
  const void *filename,   /* Database filename (UTF-16) */
-  sqlite3 **ppDb,         /* OUT: SQLite db handle */
-  const char **args       /* Null terminated array of option strings */
+  sqlite3 **ppDb          /* OUT: SQLite db handle */
 );
 } {
 Open the sqlite database file "filename".  The "filename" is UTF-8
@ -637,8 +725,9 @@ int sqlite3_open16(
 sqlite3_errmsg() or sqlite3_errmsg16()  routines can be used to obtain
 an English language description of the error.

- If the database file does not exist, then a new database is created.
- The encoding for the database is UTF-8 if sqlite3_open() is called and
+ If the database file does not exist, then a new database will be created
+ as needed.
+ The encoding for the database will be UTF-8 if sqlite3_open() is called and
 UTF-16 if sqlite3_open16 is used.

 Whether or not an error occurs when it is opened, resources associated
@ -729,15 +818,17 @@ int sqlite3_reset(sqlite3_stmt *pStmt);
 }

 api {} {
-void sqlite3_result_blob(sqlite3_context*, const void*, int n, int eCopy);
+void sqlite3_result_blob(sqlite3_context*, const void*, int n, void(*)(void*));
 void sqlite3_result_double(sqlite3_context*, double);
 void sqlite3_result_error(sqlite3_context*, const char*, int);
 void sqlite3_result_error16(sqlite3_context*, const void*, int);
 void sqlite3_result_int(sqlite3_context*, int);
 void sqlite3_result_int64(sqlite3_context*, long long int);
 void sqlite3_result_null(sqlite3_context*);
-void sqlite3_result_text(sqlite3_context*, const char*, int n, int eCopy);
-void sqlite3_result_text16(sqlite3_context*, const void*, int n, int eCopy);
+void sqlite3_result_text(sqlite3_context*, const char*, int n, void(*)(void*));
+void sqlite3_result_text16(sqlite3_context*, const void*, int n, void(*)(void*));
+void sqlite3_result_text16be(sqlite3_context*, const void*, int n, void(*)(void*));
+void sqlite3_result_text16le(sqlite3_context*, const void*, int n, void(*)(void*));
 void sqlite3_result_value(sqlite3_context*, sqlite3_value*);
 } {
 User-defined functions invoke the following routines in order to
@ -864,6 +955,8 @@ int sqlite3_value_int(sqlite3_value*);
 long long int sqlite3_value_int64(sqlite3_value*);
 const unsigned char *sqlite3_value_text(sqlite3_value*);
 const void *sqlite3_value_text16(sqlite3_value*);
+const void *sqlite3_value_text16be(sqlite3_value*);
+const void *sqlite3_value_text16le(sqlite3_value*);
 int sqlite3_value_type(sqlite3_value*);
 } {
 This group of routines returns information about parameters to
--- a/www/docs.tcl
+++ b/www/docs.tcl
@ -1,7 +1,7 @@
 # This script generates the "docs.html" page that describes various
 # sources of documentation available for SQLite.
 #
-set rcsid {$Id: docs.tcl,v 1.3 2004/06/01 01:22:38 drh Exp $}
+set rcsid {$Id: docs.tcl,v 1.4 2004/06/11 17:48:04 drh Exp $}
 source common.tcl
 header {SQLite Documentation}
 puts {
@ -40,6 +40,11 @@ doc {Tcl API} {tclsqlite.html} {
  A description of the TCL interface bindings for SQLite.
 }

+doc {Locking And Concurrency<br>In SQLite Version 3} {lockingv3.html} {
+  A description of how the new locking code in version 3 increases
+  concurrency and decreases the problem of writer starvation.
+}
+
 doc {Version 2 DataTypes } {datatypes.html} {
  A description of how SQLite version 2 handles SQL datatypes.
 }
--- a/www/lockingv3.tcl
+++ b/www/lockingv3.tcl
@ -0,0 +1,515 @@
+#
+# Run this script to generate a lockingv3.html output file
+#
+set rcsid {$Id: }
+source common.tcl
+header {File Locking And Concurrency In SQLite Version 3}
+
+# HEADING level title
+#
+# Emit one numbered HTML heading on stdout.  The counter for $level in the
+# global array pnum is advanced, every counter for a deeper level is reset
+# to zero, and the counters are joined with dots to form the section number
+# that is printed in front of $title.
+#
+proc HEADING {level title} {
+  global pnum
+  incr pnum($level)
+  # Starting a new section at this level restarts the numbering of
+  # everything nested below it.
+  foreach i [array names pnum] {
+    if {$i>$level} {set pnum($i) 0}
+  }
+  # Level N is rendered with tag <h(N+1)>, clamped to <h6>.
+  set h [expr {$level+1}]
+  if {$h>6} {set h 6}
+  # The number always carries the first two counters (so a level-1 heading
+  # comes out as "N.0" once pnum(2) has been reset above); counters
+  # 3..$level are appended only for deeper headings.
+  set n $pnum(1).$pnum(2)
+  for {set i 3} {$i<=$level} {incr i} {
+    append n .$pnum($i)
+  }
+  puts "<h$h>$n $title</h$h>"
+}
+set pnum(1) 0
+set pnum(2) 0
+set pnum(3) 0
+set pnum(4) 0
+set pnum(5) 0
+set pnum(6) 0
+set pnum(7) 0
+set pnum(8) 0
+
+HEADING 1 {File Locking And Concurrency In SQLite Version 3}
+
+puts {
+<p>Version 3 of SQLite introduces a more sophisticated locking mechanism
+designed to improve concurrency and reduce the writer starvation problem.
+This document describes the new locking mechanism.
+The intended audience is programmers who want to understand and/or modify
+the pager code and reviewers working to verify the design
+of SQLite version 3.
+</p>
+}
+
+HEADING 1 {Overview}
+
+puts {
+<p>
+Locking and concurrency control are handled by the
+<a href="http://www.sqlite.org/cvstrac/getfile/sqlite/src/pager.c">
+pager module</a>.
+The pager module is responsible for making SQLite "ACID" (Atomic,
+Consistent, Isolated, and Durable).  The pager module makes sure changes
+happen all at once, that either all changes occur or none of them do,
+that two or more threads or processes do not try to access the database
+in incompatible ways at the same time, and that once changes have been
+written they persist until explicitly deleted.  The pager also provides
+a memory cache of some of the contents of the disk file.</p>
+
+<p>The pager is unconcerned
+with the details of B-Trees, text encodings, indices, and so forth.
+From the point of view of the pager, the database consists of
+a single file of uniform-sized blocks.  Each block is called a
+"page" and is usually 1024 bytes in size.   The pages are numbered
+beginning with 1.  So the first 1024 bytes of the database are called
+"page 1" and the second 1024 bytes are called "page 2" and so forth. All 
+other encoding details are handled by higher layers of the library.  
+The pager communicates with the operating system using one of several
+modules 
+(Examples:
+<a href="http://www.sqlite.org/cvstrac/getfile/sqlite/src/os_unix.c">
+os_unix.c</a>,
+<a href="http://www.sqlite.org/cvstrac/getfile/sqlite/src/os_win.c">
+os_win.c</a>)
+that provides a uniform abstraction for operating system services.
+</p>
+}
+
+HEADING 1 {Locking}
+
+puts {
+<p>
+From the point of view of a single thread or process, a database file
+can be in one of five locking states:
+</p>
+
+<p>
+<table cellpadding="20">
+<tr><td valign="top">UNLOCKED</td>
+<td valign="top">
+No locks are held on the database.  The database may be neither read nor
+written.  Any internally cached data is considered suspect and subject to
+verification against the database file before being used.  Other threads
+and processes can read or write the database as their own locking states
+permit.  This is the default state.
+</td></tr>
+
+<tr><td valign="top">SHARED</td>
+<td valign="top">
+The database may be read but not written.  Any number of threads or
+processes can hold SHARED locks at the same time, hence there can be
+many simultaneous readers.  But no other thread or process is allowed
+to write to the database file while one or more SHARED locks are active.
+</td></tr>
+
+<tr><td valign="top">RESERVED</td>
+<td valign="top">
+A RESERVED lock means that the process is planning on writing to the
+database file at some point in the future but that it is currently just
+reading from the file.  Only a single RESERVED lock may be active at one
+time, though multiple SHARED locks can coexist with a single RESERVED lock.
+RESERVED differs from PENDING in that new SHARED locks can be acquired
+while there is a RESERVED lock.
+</td></tr>
+
+<tr><td valign="top">PENDING</td>
+<td valign="top">
+A PENDING lock means that the process holding the lock wants to write
+to the database as soon as possible and is just waiting on all current
+SHARED locks to clear so that it can get an EXCLUSIVE lock.  No new 
+SHARED locks are permitted against the database if
+a PENDING lock is active, though existing SHARED locks are allowed to
+continue.
+</td></tr>
+
+<tr><td valign="top">EXCLUSIVE</td>
+<td valign="top">
+An EXCLUSIVE lock is needed in order to write to the database file.
+Only one EXCLUSIVE lock is allowed on the file and no other locks of
+any kind are allowed to coexist with an EXCLUSIVE lock.  In order to
+maximize concurrency, SQLite works to minimize the amount of time that
+EXCLUSIVE locks are held.
+</td></tr>
+</table>
+</p>
+
+<p>
+The operating system interface layer understands and tracks all five
+locking states described above.  (It has to, since it is responsible
+for implementing the locks.)  But the pager module only tracks four
+of the five locking states.  A PENDING lock is always just a temporary
+stepping stone on the path to an EXCLUSIVE lock and so the pager module
+does not track PENDING locks.
+</p>
+}
+
+HEADING 1 {The Rollback Journal}
+
+puts {
+<p>Any time a process wants to make changes to a database file, it
+first records enough information in the <em>rollback journal</em> to
+restore the database file back to its initial condition.  Thus, before
+altering any page of the database, the original contents of that page
+must be written into the journal.  The journal also records the initial
+size of the database so that if the database file grows it can be truncated
+back to its original size on a rollback.</p>
+
+<p>The rollback journal is an ordinary disk file that has the same name as
+the database file with the suffix "<tt>-journal</tt>" added.</p>
+
+<p>If SQLite is working with multiple databases at the same time
+(using the ATTACH command) then each database has its own journal.
+But there is also a separate aggregate journal called the "master journal".
+The master journal does not contain page data used for rolling back
+changes.  Instead the master journal contains the names of the
+individual file journals for each of the ATTACHed databases.   Each of
+the individual file journals also contains the name of the master journal.
+If there are no ATTACHed databases (or if none of the ATTACHed databases
+is participating in the current transaction) no master journal is
+created and the normal rollback journal contains an empty string
+in the place normally reserved for recording the name of the master
+journal.</p>
+
+<p>An individual file journal is said to be "hot" if it needs to be rolled back
+in order to restore the integrity of its database.  
+A hot journal is created when a process is in the middle of a database
+update and a program or operating system crash or power failure prevents 
+the update from completing.
+Hot journals are an exception condition. 
+Hot journals exist to facilitate recovery from crashes and power failures.
+If everything is working correctly 
+(that is, if there are no crashes or power failures)
+you will never get a hot journal.
+</p>
+
+<p>
+If no master journal is involved, then
+a journal is hot if it exists and its corresponding database file
+does not have a RESERVED lock.
+If a master journal is named in the file journal, then the file journal
+is hot if its master journal exists and there is no RESERVED
+lock on the corresponding database file.
+</p>
+}
+
+HEADING 2 {Dealing with hot journals}
+
+puts {
+<p>
+Before reading from a database file, SQLite always checks to see if that
+file has a hot journal.  If the file does have a hot journal, then the
+journal is rolled back before the file is read.  In this way, we ensure
+that the database file is in a consistent state before it is read.
+</p>
+
+<p>When a process wants to read from a database file, it follows
+the following sequence of steps:
+</p>
+
+<ol>
+<li>Open the database file and obtain a SHARED lock.  If the SHARED lock
+    cannot be obtained, fail immediately and return SQLITE_BUSY.</li>
+<li>Check to see if the database file has a hot journal.   If the file
+    does not have a hot journal, we are done.  Return immediately.
+    If there is a hot journal, that journal must be rolled back by
+    the subsequent steps of this algorithm.</li>
+<li>Acquire a PENDING then an EXCLUSIVE lock on the database file.
+    (Note: do not acquire a RESERVED lock because that would make
+    other processes think the journal was no longer hot.)  If we
+    fail to acquire this lock it means another process or thread
+    is already trying to do the rollback.  In that case,
+    drop all locks, close the database, and return SQLITE_BUSY. </li>
+<li>Read the journal file and roll back the changes.</li>
+<li>Wait for the rolled back changes to be written onto 
+    the surface of the disk.  This protects the integrity of the database
+    in case another power failure or crash occurs.</li>
+<li>Delete the journal file.</li>
+<li>Delete the master journal file if it is safe to do so.
+    This step is optional.  It is here only to prevent stale
+    master journals from cluttering up the disk drive.
+    See the discussion below for details.</li>
+<li>Drop the EXCLUSIVE and PENDING locks but retain the SHARED lock.</li>
+</ol>
+
+<p>After the algorithm above completes successfully, it is safe to 
+read from the database file.  Once all reading has completed, the
+SHARED lock is dropped.</p>
+}
+
+HEADING 2 {Deleting stale master journals}
+
+puts {
+<p>A stale master journal is a master journal that is no longer being
+used for anything.  There is no requirement that stale master journals
+be deleted.  The only reason for doing so is to free up disk space.</p>
+
+<p>A master journal is stale if no individual file journals are pointing
+to it.  To figure out if a master journal is stale, we first read the
+master journal to obtain the names of all of its file journals.  Then
+we check each of those file journals.  If any of the file journals named
+in the master journal exists and points back to the master journal, then
+the master journal is not stale.  If all file journals are either missing
+or refer to other master journals or no master journal at all, then the
+master journal we are testing is stale and can be safely deleted.</p>
+}
+
+HEADING 2 {Writing to a database file}
+
+puts {
+<p>To write to a database, a process must first acquire a SHARED lock
+as described above (possibly rolling back incomplete changes if there
+is a hot journal). 
+After a SHARED lock is obtained, a RESERVED lock must be acquired.
+The RESERVED lock signals that the process intends to write to the
+database at some point in the future.  Only one process at a time
+can hold a reserved lock.  But other processes can continue to read
+the database while the RESERVED lock is held.
+</p>
+
+<p>If the process that wants to write is unable to obtain a RESERVED
+lock, it must mean that another process already has a RESERVED lock.
+In that case, the write attempt fails and returns SQLITE_BUSY.</p>
+
+<p>After obtaining a RESERVED lock, the process that wants to write
+creates a rollback journal.  The header of the journal is initialized
+with the original size of the database file.  Space in the journal header
+is also reserved for a master journal name, though the master journal
+name is initially empty.</p>
+
+<p>Before making changes to any page of the database, the process writes
+the original value of that page into the rollback journal.  Changes
+to pages are held in memory at first and are not written to the disk.
+The original database file remains unaltered, which means that other
+processes can continue to read the database.</p>
+
+<p>Eventually, the writing process will want to update the database
+file, either because its memory cache has filled up or because it is
+ready to commit its changes.  Before this happens, the writer must
+make sure no other process is reading the database and that the rollback
+journal data is safely on the disk surface so that it can be used to
+rollback incomplete changes in the event of a power failure.
+The steps are as follows:</p>
+
+<ol>
+<li>Make sure all rollback journal data has actually been written to
+    the surface of the disk (and is not just being held in the operating
+    system's or disk controller's cache) so that if a power failure occurs
+    the data will still be there after power is restored.</li>
+<li>Obtain a PENDING lock and then an EXCLUSIVE lock on the database file.
+    If other processes still have SHARED locks, the writer might have
+    to wait until those SHARED locks clear before it is able to obtain
+    an EXCLUSIVE lock.</li>
+<li>Write all page modifications currently held in memory out to the
+    original database disk file.</li>
+</ol>
+
+<p>
+If the reason for writing to the database file is because the memory
+cache was full, then the writer will not commit right away.  Instead,
+the writer might continue to make changes to other pages.  Before 
+subsequent changes are written to the database file, the rollback
+journal must be flushed to disk again.  Note also that the EXCLUSIVE
+lock that the writer obtained in order to write to the database initially
+must be held until all changes are committed.  That means that from the
+time the memory cache first spills to disk up until the transaction
+commits, no other processes are able to access the database.
+</p>
+
+<p>
+When a writer is ready to commit its changes, it executes the following
+steps:
+</p>
+
+<ol>
+<li value="4">
+   Obtain an EXCLUSIVE lock on the database file and
+   make sure all memory changes have been written to the database file
+   using the algorithm of steps 1-3 above.</li>
+<li>Flush all database file changes to the disk.  Wait for those changes
+    to actually be written onto the disk surface.</li>
+<li>Delete the journal file.  This is the instant when the changes are
+    committed.  Prior to deleting the journal file, if a power failure
+    or crash occurs, the next process to open the database will see that
+    it has a hot journal and will roll the changes back.
+    After the journal is deleted, there will no longer be a hot journal
+    and the changes will persist.
+    </li>
+<li>Drop the EXCLUSIVE and PENDING locks from the database file.
+    </li>
+</ol>
+
+<p>As soon as the PENDING lock is released from the database file, other
+processes can begin reading the database again.  In the current implementation,
+the RESERVED lock is also released, but that is not essential.  Future
+versions of SQLite might provide a "CHECKPOINT" SQL command that will
+commit all changes made so far within a transaction but retain the
+RESERVED lock so that additional changes can be made without giving
+any other process an opportunity to write.</p>
+
+<p>If a transaction involves multiple databases, then a more complex
+commit sequence is used, as follows:</p>
+
+<ol>
+<li value="4">
+   Make sure all individual database files have an EXCLUSIVE lock and a
+   valid journal.
+<li>Create a master-journal.  The name of the master-journal is arbitrary.
+    (The current implementation appends random suffixes to the name of the
+    main database file until it finds a name that does not previously exist.)
+    Fill the master journal with the names of all the individual journals
+    and flush its contents to disk.
+<li>Write the name of the master journal into
+    all individual journals (in space set aside for that purpose in the
+    headers of the individual journals) and flush the contents of the
+    individual journals to disk and wait for those changes to reach the
+    disk surface.
+<li>Flush all database file changes to the disk.  Wait for those changes
+    to actually be written onto the disk surface.</li>
+<li>Delete the master journal file.  This is the instant when the changes are
+    committed.  Prior to deleting the master journal file, if a power failure
+    or crash occurs, the individual file journals will be considered hot
+    and will be rolled back by the next process that
+    attempts to read them.  After the master journal has been deleted,
+    the file journals will no longer be considered hot and the changes
+    will persist.
+    </li>
+<li>Delete all individual journal files.
+<li>Drop the EXCLUSIVE and PENDING locks from all database files.
+    </li>
+</ol>
+}
+
+HEADING 1 {How To Corrupt Your Database Files}
+
+puts {
+<p>The pager module is robust but it is not completely failsafe.
+It can be subverted.  This section attempts to identify and explain
+the risks.</p>
+
+<p>
+Clearly, a hardware or operating system fault that introduces incorrect data
+into the middle of the database file or journal will cause problems.
+Likewise, 
+if a rogue process opens a database file or journal and writes malformed
+data into the middle of it, then the database will become corrupt.
+There is not much that can be done about these kinds of problems
+so they are given no further attention.
+</p>
+
+<p>
+SQLite uses POSIX advisory locks to implement locking on Unix.  On
+Windows it uses the LockFile(), LockFileEx(), and UnlockFile() system
+calls.  SQLite assumes that these system calls all work as advertised.  If
+that is not the case, then database corruption can result.  One should
+note that POSIX advisory locking is known to be buggy or even unimplemented
+on many NFS implementations (including recent versions of Mac OS X)
+and that there are persistent reports of locking problems
+for network filesystems under Windows.  Your best defense is to not
+use SQLite for files on a network filesystem.
+</p>
+
+<p>
+SQLite uses the fsync() system call to flush data to the disk under Unix and
+it uses FlushFileBuffers() to do the same under Windows.  Once again,
+SQLite assumes that these operating system services function as advertised.
+But it has been reported that fsync() and FlushFileBuffers() do not always
+work correctly, especially with inexpensive IDE disks.  Apparently some
+manufacturers of IDE disks have defective controller chips that report
+that data has reached the disk surface when in fact the data is still
+in volatile cache memory in the disk drive electronics.  There are also
+reports that Windows sometimes chooses to ignore FlushFileBuffers() for
+unspecified reasons.  The author cannot verify any of these reports.
+But if they are true, it means that database corruption is a possibility
+following an unexpected power loss.  These are hardware and/or operating
+system bugs that SQLite is unable to defend against.
+</p>
+
+<p>
+If a crash or power failure occurs and results in a hot journal, but that
+journal is deleted, the next process to open the database will not
+know that it contains changes that need to be rolled back.  The rollback
+will not occur and the database will be left in an inconsistent state.
+Rollback journals might be deleted for any number of reasons:
+</p>
+
+<ul>
+<li>An administrator might be cleaning up after an OS crash or power failure,
+    see the journal file, think it is junk, and delete it.</li>
+<li>Someone (or some process) might rename the database file but fail to
+    also rename its associated journal.</li>
+<li>If the database file has aliases (hard or soft links) and the file
+    is opened by a different alias than the one used to create the journal,
+    then the journal will not be found.  To avoid this problem, you should
+    not create links to SQLite database files.</li>
+<li>Filesystem corruption following a power failure might cause the
+    journal to be renamed or deleted.</li>
+</ul>
+
+<p>
+The last (fourth) bullet above merits additional comment.  When SQLite creates
+a journal file on Unix, it opens the directory that contains that file and
+calls fsync() on the directory, in an effort to push the directory information
+to disk.  But suppose some other process is adding or removing unrelated
+files to the directory that contains the database and journal at the
+moment of a power failure.  The supposedly unrelated actions of this other
+process might result in the journal file being dropped from the directory and
+moved into "lost+found".  This is an unlikely scenario, but it could happen.
+The best defenses are to use a journaling filesystem or to keep the
+database and journal in a directory by themselves.
+</p>
+
+<p>
+For a commit involving multiple databases and a master journal, if the
+various databases were on different disk volumes and a power failure occurs
+during the commit, then when the machine comes back up the disks might
+be remounted with different names.  Or some disks might not be mounted
+at all.   When this happens the individual file journals and the master
+journal might not be able to find each other. The worst outcome from
+this scenario is that the commit ceases to be atomic.  
+Some databases might be rolled back and others might not. 
+All databases will continue to be self-consistent.
+To defend against this problem, keep all databases
+on the same disk volume and/or remount disks using exactly the same names
+after a power failure.
+</p>
+}
+
+HEADING 1 {Transaction Control At The SQL Level}
+
+puts {
+<p>
+The changes to locking and concurrency control in SQLite version 3 also
+introduce some subtle changes in the way transactions work at the SQL
+language level.
+By default, SQLite version 3 operates in "autocommit" mode.  In autocommit mode,
+all changes to the database are committed as soon as all operations associated
+with the current database connection complete.</p>
+
+<p>The SQL command "BEGIN TRANSACTION" (the TRANSACTION keyword
+is optional) is used to take SQLite out of autocommit mode.
+Note that the BEGIN command does not acquire any locks on the database.
+After a BEGIN command, a SHARED lock will be acquired when the first
+SELECT statement is executed.  A RESERVED lock will be acquired when
+the first INSERT, UPDATE, or DELETE statement is executed.  No EXCLUSIVE
+lock is acquired until either the memory cache fills up and must
+be spilled to disk or until the transaction commits.  In this way,
+the system delays blocking read access to the database file until the
+last possible moment.
+</p>
+
+<p>The SQL command "COMMIT"  does not actually commit the changes to
+disk.  It just turns autocommit back on.  Then, at the conclusion of
+the command, the regular autocommit logic takes over and causes the
+actual commit to disk to occur.
+The SQL command "ROLLBACK" also operates by turning autocommit back on,
+but it also sets a flag that tells the autocommit logic to rollback rather
+than commit.</p>
+
+<p>If the SQL COMMIT command turns autocommit on and the autocommit logic
+then tries to commit changes but fails because some other process is holding
+a SHARED lock, then autocommit is turned back off automatically.  This
+allows the user to retry the COMMIT at a later time after the SHARED lock
+has had an opportunity to clear.</p>
+}
+
+
+footer $rcsid