Documentation of the new pager locking mechanism. (CVS 1570)

FossilOrigin-Name: 13cf1ba8256bf8cee0195dbaeac71a20cd2c2fc2
This commit is contained in:
drh 2004-06-11 17:48:02 +00:00
parent ce2663ccc8
commit 52619dfb7f
7 changed files with 664 additions and 40 deletions

View File

@ -419,6 +419,9 @@ index.html: $(TOP)/www/index.tcl last_change
lang.html: $(TOP)/www/lang.tcl
tclsh $(TOP)/www/lang.tcl >lang.html
lockingv3.html: $(TOP)/www/lockingv3.tcl
tclsh $(TOP)/www/lockingv3.tcl >lockingv3.html
omitted.html: $(TOP)/www/omitted.tcl
tclsh $(TOP)/www/omitted.tcl >omitted.html
@ -475,6 +478,7 @@ DOC = \
formatchng.html \
index.html \
lang.html \
lockingv3.html \
mingw.html \
nulls.html \
omitted.html \

View File

@ -1,5 +1,5 @@
C Have\sthe\svdbe\saggregator\suse\sa\sbtree\stable\sinstead\sof\sa\shash\stable.\s(CVS\s1569)
D 2004-06-11T13:19:21
C Documentation\sof\sthe\snew\spager\slocking\smechanism.\s(CVS\s1570)
D 2004-06-11T17:48:03
F Makefile.in ab7b0d5118e2da97bac66be8684a1034e3500f5a
F Makefile.linux-gcc a9e5a0d309fa7c38e7c14d3ecf7690879d3a5457
F README f1de682fbbd94899d50aca13d387d1b3fd3be2dd
@ -17,7 +17,7 @@ F doc/lemon.html f0f682f50210928c07e562621c3b7e8ab912a538
F doc/report1.txt a031aaf37b185e4fa540223cb516d3bccec7eeac
F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895
F ltmain.sh f6b283068efa69f06eb8aa1fe4bddfdbdeb35826
F main.mk 126f1ca2190fa19cc9944f54943cf3431891c330
F main.mk e02f35c8354fe9a0ab48fc87b0bcb3fb7a86d713
F mkdll.sh 68d34a961a1fdfa15ef27fc4f4740be583112124
F publish.sh 2e579b7474d57b6debcead93c73a49eb8cb81718
F spec.template a38492f1c1dd349fc24cb0565e08afc53045304b
@ -56,7 +56,7 @@ F src/printf.c 63b15f1ea9fe3daa066bb7430fd20d4a2d717dc8
F src/random.c eff68e3f257e05e81eae6c4d50a51eb88beb4ff3
F src/select.c d29488f86e61e0d45dff318e1f04ba6a7e5782d0
F src/shell.c ca519519dcbbc582f6d88f7d0e7583b857fd3469
F src/sqlite.h.in 2b6afe1de6935d3dfbd6042f46a62f1b7c3b3992
F src/sqlite.h.in 56c53344a6fbd76328d641d9ddab90d4e4ba9129
F src/sqliteInt.h 625faf4c9ce2f99b9c85a2bca5c4e73736c30262
F src/table.c af14284fa36c8d41f6829e3f2819dce07d3e2de2
F src/tclsqlite.c e974c0b2479ed37334aeb268de331e0a1b21b5a8
@ -194,7 +194,7 @@ F www/arch2.gif 49c6bb36160f03ca2b89eaa5bfb1f560c7d68ee7
F www/audit.tcl 90e09d580f79c7efec0c7d6f447b7ec5c2dce5c0
F www/c_interface.tcl 2176519fc2bd2d2cf6fe74fd806fc2d8362de2c8
F www/capi3.tcl a940c5ca0b0ebafd5acfe2ceca5a388bd8cfb468
F www/capi3ref.tcl 4248a45a0fd21fe0c116326300408d0999a028b5
F www/capi3ref.tcl 2cb0861c5219c6e44298c217a6c904464b4403bc
F www/changes.tcl cbe942607b2b9e5dc995395f37042dbb5a629c7f
F www/common.tcl f786e6be86fb2627ceb30e770e9efa83b9c67a3a
F www/conflict.tcl fb8a2ba83746c7fdfd9e52fa7f6aaf5c422b8246
@ -203,7 +203,7 @@ F www/copyright-release.pdf cfca3558fc97095e57c6117d08f1f5b80d95125a
F www/copyright.tcl 82c9670c7ddb0311912ab7fe24703f33c531066c
F www/datatype3.tcl f48b05cafd5e54ae5c05e643169d5217ee51a244
F www/datatypes.tcl 566004b81c36877397ddbe6e1907aae6065f6b41
F www/docs.tcl 0dcbf954907bd5dbfb7f1e0220f4e50516e07cd3
F www/docs.tcl a924a5043973a70c58b65abf1051ed8b1238864d
F www/download.tcl 8c84f15695c92cb01486930055fdf5192995f474
F www/dynload.tcl 02eb8273aa78cfa9070dd4501dca937fb22b466c
F www/faq.tcl 3a1776818d9bd973ab0c3048ec7ad6b1ad091ae5
@ -211,6 +211,7 @@ F www/fileformat.tcl f71a06a0d533c7df408539c64113b4adeaf29764
F www/formatchng.tcl 7ed8a5c871ab105f01e5defe1822ec39f70675bb
F www/index.tcl 64435502af780d7cd813365d443b9b9344662ce4
F www/lang.tcl fc528581c4a406cabc138e2f17db5ef6f38615ff
F www/lockingv3.tcl 1449cf3ff0249d6ed44e06b05244b423693d6265
F www/mingw.tcl d96b451568c5d28545fefe0c80bee3431c73f69c
F www/nulls.tcl f31330db8c978e675f5cd263067b32b822effa6f
F www/omitted.tcl 7bd62b6f0f53b60c5360895b16b3af8407bbca03
@ -222,7 +223,7 @@ F www/support.tcl 1801397edd271cc39a2aadd54e701184b5181248
F www/tclsqlite.tcl 19191cf2a1010eaeff74c51d83fd5f5a4d899075
F www/vdbe.tcl 59288db1ac5c0616296b26dce071c36cb611dfe9
F www/whentouse.tcl a8335bce47cc2fddb07f19052cb0cb4d9129a8e4
P 66835ee67051027456a536e33b2f88a741654525
R 17851c4c7a9d880622be1c17f4d88abf
U danielk1977
Z 37fd25919977ced5eda4bf17b983a171
P 8d56118f64dbaf8c8006266fa7026f900a4a16bd
R 452a2adcbc4985a31c30bc500c6b52fd
U drh
Z 469b4dfcfa6222fd78598800788c6259

View File

@ -1 +1 @@
8d56118f64dbaf8c8006266fa7026f900a4a16bd
13cf1ba8256bf8cee0195dbaeac71a20cd2c2fc2

View File

@ -12,7 +12,7 @@
** This header file defines the interface that the SQLite library
** presents to client programs.
**
** @(#) $Id: sqlite.h.in,v 1.97 2004/06/10 10:50:30 danielk1977 Exp $
** @(#) $Id: sqlite.h.in,v 1.98 2004/06/11 17:48:03 drh Exp $
*/
#ifndef _SQLITE_H_
#define _SQLITE_H_
@ -963,9 +963,15 @@ void sqlite3_result_text(sqlite3_context*, const char*, int n, int eCopy);
void sqlite3_result_text16(sqlite3_context*, const void*, int n, int eCopy);
void sqlite3_result_value(sqlite3_context*, sqlite3_value*);
/*
** These are the allowed values for the eTextRep argument to
** sqlite3_create_collation and sqlite3_create_function.
*/
#define SQLITE_UTF8 1
#define SQLITE_UTF16LE 2
#define SQLITE_UTF16BE 3
#define SQLITE_UTF16 2 /* Use native byte order */
#define SQLITE_UTF16LE 3
#define SQLITE_UTF16BE 4
#define SQLITE_ANY 5 /* sqlite3_create_function only */
/*
** These two functions are used to add new collation sequences to the

View File

@ -1,4 +1,4 @@
set rcsid {$Id: capi3ref.tcl,v 1.1 2004/06/01 01:22:38 drh Exp $}
set rcsid {$Id: capi3ref.tcl,v 1.2 2004/06/11 17:48:04 drh Exp $}
source common.tcl
header {C/C++ Interface For SQLite Version 3}
puts {
@ -69,13 +69,15 @@ api {} {
}
api {} {
int sqlite3_bind_blob(sqlite3_stmt*, int, const void*, int n, int eCopy);
int sqlite3_bind_blob(sqlite3_stmt*, int, const void*, int n, void(*)(void*));
int sqlite3_bind_double(sqlite3_stmt*, int, double);
int sqlite3_bind_int(sqlite3_stmt*, int, int);
int sqlite3_bind_int64(sqlite3_stmt*, int, long long int);
int sqlite3_bind_null(sqlite3_stmt*, int);
int sqlite3_bind_text(sqlite3_stmt*, int, const char*, int n, int eCopy);
int sqlite3_bind_text16(sqlite3_stmt*, int, const void*, int n, int eCopy);
int sqlite3_bind_text(sqlite3_stmt*, int, const char*, int n, void(*)(void*));
int sqlite3_bind_text16(sqlite3_stmt*, int, const void*, int n, void(*)(void*));
#define SQLITE_STATIC ((void*)0)
#define SQLITE_EPHEMERAL ((void*)8)
} {
In the SQL strings input to sqlite3_prepare() and sqlite3_prepare16(),
one or more literals can be replaced by a wildcard "?" or ":N:" where
@ -87,10 +89,13 @@ api {} {
index of the wildcard. The first "?" has an index of 1. ":N:" wildcards
use the index N.
When the eCopy parameter is true, a copy of the value is made into
memory obtained and managed by SQLite. When eCopy is false, SQLite
assumes that the value is a constant and just stores a pointer to the
value without making a copy.
The fifth parameter to sqlite3_bind_blob(), sqlite3_bind_text(), and
sqlite3_bind_text16() is a destructor used to dispose of the BLOB or
text after SQLite has finished with it. If the fifth argument is the
special value SQLITE_STATIC, then the library assumes that the information
is in static, unmanaged space and does not need to be freed. If the
fifth argument has the value SQLITE_EPHEMERAL, then SQLite makes its
own private copy of the data.
The sqlite3_bind_*() routine must be called after
sqlite3_prepare() or sqlite3_reset() and before sqlite3_step().
@ -99,16 +104,16 @@ api {} {
}
api {} {
void sqlite3_busy_handler(sqlite*, int(*)(void*,const char*,int), void*);
void sqlite3_busy_handler(sqlite*, int(*)(void*,int), void*);
} {
This routine identifies a callback function that is invoked
whenever an attempt is made to open a database table that is
currently locked by another process or thread. If the busy callback
is NULL, then sqlite3_exec() returns SQLITE_BUSY immediately if
it finds a locked table. If the busy callback is not NULL, then
sqlite3_exec() invokes the callback with three arguments. The
second argument is the name of the locked table and the third
argument is the number of times the table has been busy. If the
sqlite3_exec() invokes the callback with two arguments. The
second argument is the number of prior calls to the busy callback
for the same lock. If the
busy callback returns 0, then sqlite3_exec() immediately returns
SQLITE_BUSY. If the callback returns non-zero, then sqlite3_exec()
tries to open the table again and the cycle repeats.
@ -309,6 +314,90 @@ int sqlite3_complete16(const void *sql);
false.
} {}
api {} {
int sqlite3_create_collation(
sqlite3*,
const char *zName,
int pref16,
void*,
int(*xCompare)(void*,int,const void*,int,const void*)
);
int sqlite3_create_collation16(
sqlite3*,
const char *zName,
int pref16,
void*,
int(*xCompare)(void*,int,const void*,int,const void*)
);
#define SQLITE_UTF8 1
#define SQLITE_UTF16 2
#define SQLITE_UTF16LE 3
#define SQLITE_UTF16BE 4
} {
These two functions are used to add new collation sequences to the
sqlite3 handle specified as the first argument.
The name of the new collation sequence is specified as a UTF-8 string
for sqlite3_create_collation() and a UTF-16 string for
sqlite3_create_collation16(). In both cases the name is passed as the
second function argument.
The third argument must be one of the constants SQLITE_UTF8,
SQLITE_UTF16LE or SQLITE_UTF16BE, indicating that the user-supplied
routine expects to be passed pointers to strings encoded using UTF-8,
UTF-16 little-endian or UTF-16 big-endian respectively.
A pointer to the user supplied routine must be passed as the fifth
argument. If it is NULL, this is the same as deleting the collation
sequence (so that SQLite cannot call it anymore). Each time the user
supplied function is invoked, it is passed a copy of the void* passed as
the fourth argument to sqlite3_create_collation() or
sqlite3_create_collation16() as its first parameter.
The remaining arguments to the user-supplied routine are two strings,
each represented by a [length, data] pair and encoded in the encoding
that was passed as the third argument when the collation sequence was
registered. The user routine should return negative, zero or positive if
the first string is less than, equal to, or greater than the second
string. i.e. (STRING1 - STRING2).
}
api {} {
int sqlite3_collation_needed(
sqlite3*,
void*,
void(*)(void*,sqlite3*,int eTextRep,const char*)
);
int sqlite3_collation_needed16(
sqlite3*,
void*,
void(*)(void*,sqlite3*,int eTextRep,const void*)
);
} {
To avoid having to register all collation sequences before a database
can be used, a single callback function may be registered with the
database handle to be called whenever an undefined collation sequence is
required.
If the function is registered using the sqlite3_collation_needed() API,
then it is passed the names of undefined collation sequences as strings
encoded in UTF-8. If sqlite3_collation_needed16() is used, the names
are passed as UTF-16 in machine native byte order. A call to either
function replaces any existing callback.
When the user-function is invoked, the first argument passed is a copy
of the second argument to sqlite3_collation_needed() or
sqlite3_collation_needed16(). The second argument is the database
handle. The third argument is one of SQLITE_UTF8, SQLITE_UTF16BE or
SQLITE_UTF16LE, indicating the most desirable form of the collation
sequence function required. The fourth parameter is the name of the
required collation sequence.
The collation sequence is returned to SQLite by a collation-needed
callback using the sqlite3_create_collation() or
sqlite3_create_collation16() APIs, described above.
}
api {} {
int sqlite3_create_function(
sqlite3 *,
@ -332,10 +421,11 @@ int sqlite3_create_function16(
void (*xStep)(sqlite3_context*,int,sqlite3_value**),
void (*xFinal)(sqlite3_context*)
);
#define SQLITE3_UTF8 1
#define SQLITE3_UTF16LE 2
#define SQLITE3_UTF16BE 3
#define SQLITE3_ANY 4
#define SQLITE_UTF8 1
#define SQLITE_UTF16 2
#define SQLITE_UTF16LE 3
#define SQLITE_UTF16BE 4
#define SQLITE_ANY 5
} {
These two functions are used to add user functions or aggregates
implemented in C to the SQL language interpreted by SQLite. The
@ -620,13 +710,11 @@ char *sqlite3_vmprintf(const char*, va_list);
api {} {
int sqlite3_open(
const char *filename, /* Database filename (UTF-8) */
sqlite3 **ppDb, /* OUT: SQLite db handle */
const char **args /* Null terminated array of option strings */
sqlite3 **ppDb /* OUT: SQLite db handle */
);
int sqlite3_open16(
const void *filename, /* Database filename (UTF-16) */
sqlite3 **ppDb, /* OUT: SQLite db handle */
const char **args /* Null terminated array of option strings */
sqlite3 **ppDb /* OUT: SQLite db handle */
);
} {
Open the sqlite database file "filename". The "filename" is UTF-8
@ -637,8 +725,9 @@ int sqlite3_open16(
sqlite3_errmsg() or sqlite3_errmsg16() routines can be used to obtain
an English language description of the error.
If the database file does not exist, then a new database is created.
The encoding for the database is UTF-8 if sqlite3_open() is called and
If the database file does not exist, then a new database will be created
as needed.
The encoding for the database will be UTF-8 if sqlite3_open() is called and
UTF-16 if sqlite3_open16 is used.
Whether or not an error occurs when it is opened, resources associated
@ -729,15 +818,17 @@ int sqlite3_reset(sqlite3_stmt *pStmt);
}
api {} {
void sqlite3_result_blob(sqlite3_context*, const void*, int n, int eCopy);
void sqlite3_result_blob(sqlite3_context*, const void*, int n, void(*)(void*));
void sqlite3_result_double(sqlite3_context*, double);
void sqlite3_result_error(sqlite3_context*, const char*, int);
void sqlite3_result_error16(sqlite3_context*, const void*, int);
void sqlite3_result_int(sqlite3_context*, int);
void sqlite3_result_int64(sqlite3_context*, long long int);
void sqlite3_result_null(sqlite3_context*);
void sqlite3_result_text(sqlite3_context*, const char*, int n, int eCopy);
void sqlite3_result_text16(sqlite3_context*, const void*, int n, int eCopy);
void sqlite3_result_text(sqlite3_context*, const char*, int n, void(*)(void*));
void sqlite3_result_text16(sqlite3_context*, const void*, int n, void(*)(void*));
void sqlite3_result_text16be(sqlite3_context*, const void*, int n, void(*)(void*));
void sqlite3_result_text16le(sqlite3_context*, const void*, int n, void(*)(void*));
void sqlite3_result_value(sqlite3_context*, sqlite3_value*);
} {
User-defined functions invoke the following routines in order to
@ -864,6 +955,8 @@ int sqlite3_value_int(sqlite3_value*);
long long int sqlite3_value_int64(sqlite3_value*);
const unsigned char *sqlite3_value_text(sqlite3_value*);
const void *sqlite3_value_text16(sqlite3_value*);
const void *sqlite3_value_text16be(sqlite3_value*);
const void *sqlite3_value_text16le(sqlite3_value*);
int sqlite3_value_type(sqlite3_value*);
} {
This group of routines returns information about parameters to

View File

@ -1,7 +1,7 @@
# This script generates the "docs.html" page that describes various
# sources of documentation available for SQLite.
#
set rcsid {$Id: docs.tcl,v 1.3 2004/06/01 01:22:38 drh Exp $}
set rcsid {$Id: docs.tcl,v 1.4 2004/06/11 17:48:04 drh Exp $}
source common.tcl
header {SQLite Documentation}
puts {
@ -40,6 +40,11 @@ doc {Tcl API} {tclsqlite.html} {
A description of the TCL interface bindings for SQLite.
}
doc {Locking And Concurrency<br>In SQLite Version 3} {lockingv3.html} {
A description of how the new locking code in version 3 increases
concurrency and decreases the problem of writer starvation.
}
doc {Version 2 DataTypes } {datatypes.html} {
A description of how SQLite version 2 handles SQL datatypes.
}

515
www/lockingv3.tcl Normal file
View File

@ -0,0 +1,515 @@
#
# Run this script to generate a lockingv3.html output file
#
set rcsid {$Id: }
source common.tcl
header {File Locking And Concurrency In SQLite Version 3}
# Emit a numbered HTML heading on standard output.
#
#   level  - nesting depth of the heading (1 is the outermost).
#   title  - heading text, inserted into the output as-is.
#
# The global array pnum holds one counter per nesting level.  The counter
# for $level is advanced, every counter for a deeper level is reset to
# zero, and the heading is written as <hK>N title</hK> where K is
# level+1 (never more than 6) and N is the dotted section number.  N
# always begins with the level-1 and level-2 counters, so a level-1
# heading is numbered like "1.0".
proc HEADING {level title} {
  global pnum

  # Advance this level and restart numbering of everything nested below it.
  incr pnum($level)
  foreach depth [array names pnum] {
    if {$depth > $level} {
      set pnum($depth) 0
    }
  }

  # The HTML heading tag is one deeper than the level, capped at <h6>.
  set tag [expr {$level + 1}]
  if {$tag > 6} {
    set tag 6
  }

  # Build the dotted section number: the first two counters are always
  # present; deeper counters are appended up to the requested level.
  set number "$pnum(1).$pnum(2)"
  set depth 3
  while {$depth <= $level} {
    append number "." $pnum($depth)
    incr depth
  }

  puts "<h$tag>$number $title</h$tag>"
}
# Start every section counter (heading levels 1 through 8) at zero.
array set pnum {1 0 2 0 3 0 4 0 5 0 6 0 7 0 8 0}
HEADING 1 {File Locking And Concurrency In SQLite Version 3}
puts {
<p>Version 3 of SQLite introduces a more sophisticated locking mechanism
designed to improve concurrency and reduce the writer starvation problem.
This document describes the new locking mechanism.
The intended audience is programmers who want to understand and/or modify
the pager code and reviewers working to verify the design
of SQLite version 3.
</p>
}
HEADING 1 {Overview}
puts {
<p>
Locking and concurrency control are handled by the
<a href="http://www.sqlite.org/cvstrac/getfile/sqlite/src/pager.c">
pager module</a>.
The pager module is responsible for making SQLite "ACID" (Atomic,
Consistent, Isolated, and Durable). The pager module makes sure changes
happen all at once, that either all changes occur or none of them do,
that two or more threads or processes do not try to access the database
in incompatible ways at the same time, and that once changes have been
written they persist until explicitly deleted. The pager also provides
an in-memory cache of some of the contents of the disk file.</p>
<p>The pager is unconcerned
with the details of B-Trees, text encodings, indices, and so forth.
From the point of view of the pager, the database consists of
a single file of uniform-sized blocks. Each block is called a
"page" and is usually 1024 bytes in size. The pages are numbered
beginning with 1. So the first 1024 bytes of the database are called
"page 1" and the second 1024 bytes are called "page 2" and so forth. All
other encoding details are handled by higher layers of the library.
The pager communicates with the operating system using one of several
modules
(Examples:
<a href="http://www.sqlite.org/cvstrac/getfile/sqlite/src/os_unix.c">
os_unix.c</a>,
<a href="http://www.sqlite.org/cvstrac/getfile/sqlite/src/os_win.c">
os_win.c</a>)
that provides a uniform abstraction for operating system services.
</p>
}
HEADING 1 {Locking}
puts {
<p>
From the point of view of a single thread or process, a database file
can be in one of five locking states:
</p>
<p>
<table cellpadding="20">
<tr><td valign="top">UNLOCKED</td>
<td valign="top">
No locks are held on the database. The database may be neither read nor
written. Any internally cached data is considered suspect and subject to
verification against the database file before being used. Other threads
and processes can read or write the database as their own locking states
permit. This is the default state.
</td></tr>
<tr><td valign="top">SHARED</td>
<td valign="top">
The database may be read but not written. Any number of threads or
processes can hold SHARED locks at the same time, hence there can be
many simultaneous readers. But no other thread or process is allowed
to write to the database file while one or more SHARED locks are active.
</td></tr>
<tr><td valign="top">RESERVED</td>
<td valign="top">
A RESERVED lock means that the process is planning on writing to the
database file at some point in the future but that it is currently just
reading from the file. Only a single RESERVED lock may be active at one
time, though multiple SHARED locks can coexist with a single RESERVED lock.
RESERVED differs from PENDING in that new SHARED locks can be acquired
while there is a RESERVED lock.
</td></tr>
<tr><td valign="top">PENDING</td>
<td valign="top">
A PENDING lock means that the process holding the lock wants to write
to the database as soon as possible and is just waiting on all current
SHARED locks to clear so that it can get an EXCLUSIVE lock. No new
SHARED locks are permitted against the database if
a PENDING lock is active, though existing SHARED locks are allowed to
continue.
</td></tr>
<tr><td valign="top">EXCLUSIVE</td>
<td valign="top">
An EXCLUSIVE lock is needed in order to write to the database file.
Only one EXCLUSIVE lock is allowed on the file and no other locks of
any kind are allowed to coexist with an EXCLUSIVE lock. In order to
maximize concurrency, SQLite works to minimize the amount of time that
EXCLUSIVE locks are held.
</td></tr>
</table>
</p>
<p>
The operating system interface layer understands and tracks all five
locking states described above. (It has to, since it is responsible
for implementing the locks.) But the pager module only tracks four
of the five locking states. A PENDING lock is always just a temporary
stepping stone on the path to an EXCLUSIVE lock and so the pager module
does not track PENDING locks.
</p>
}
HEADING 1 {The Rollback Journal}
puts {
<p>Any time a process wants to make changes to a database file, it
first records enough information in the <em>rollback journal</em> to
restore the database file back to its initial condition. Thus, before
altering any page of the database, the original contents of that page
must be written into the journal. The journal also records the initial
size of the database so that if the database file grows it can be truncated
back to its original size on a rollback.</p>
<p>The rollback journal is an ordinary disk file that has the same name as
the database file with the suffix "<tt>-journal</tt>" added.</p>
<p>If SQLite is working with multiple databases at the same time
(using the ATTACH command) then each database has its own journal.
But there is also a separate aggregate journal called the "master journal".
The master journal does not contain page data used for rolling back
changes. Instead the master journal contains the names of the
individual file journals for each of the ATTACHed databases. Each of
the individual file journals also contains the name of the master journal.
If there are no ATTACHed databases (or if none of the ATTACHed databases
is participating in the current transaction) no master journal is
created and the normal rollback journal contains an empty string
in the place normally reserved for recording the name of the master
journal.</p>
<p>An individual file journal is said to be "hot" if it needs to be rolled back
in order to restore the integrity of its database.
A hot journal is created when a process is in the middle of a database
update and a program or operating system crash or power failure prevents
the update from completing.
Hot journals are an exception condition.
Hot journals exist to facilitate recovery from crashes and power failures.
If everything is working correctly
(that is, if there are no crashes or power failures)
you will never get a hot journal.
</p>
<p>
If no master journal is involved, then
a journal is hot if it exists and its corresponding database file
does not have a RESERVED lock.
If a master journal is named in the file journal, then the file journal
is hot if its master journal exists and there is no RESERVED
lock on the corresponding database file.
</p>
}
HEADING 2 {Dealing with hot journals}
puts {
<p>
Before reading from a database file, SQLite always checks to see if that
file has a hot journal. If the file does have a hot journal, then the
journal is rolled back before the file is read. In this way, we ensure
that the database file is in a consistent state before it is read.
</p>
<p>When a process wants to read from a database file, it follows
the following sequence of steps:
</p>
<ol>
<li>Open the database file and obtain a SHARED lock. If the SHARED lock
cannot be obtained, fail immediately and return SQLITE_BUSY.</li>
<li>Check to see if the database file has a hot journal. If the file
does not have a hot journal, we are done. Return immediately.
If there is a hot journal, that journal must be rolled back by
the subsequent steps of this algorithm.</li>
<li>Acquire a PENDING then an EXCLUSIVE lock on the database file.
(Note: do not acquire a RESERVED lock because that would make
other processes think the journal was no longer hot.) If we
fail to acquire this lock it means another process or thread
is already trying to do the rollback. In that case,
drop all locks, close the database, and return SQLITE_BUSY. </li>
<li>Read the journal file and roll back the changes.</li>
<li>Wait for the rolled back changes to be written onto
the surface of the disk. This protects the integrity of the database
in case another power failure or crash occurs.</li>
<li>Delete the journal file.</li>
<li>Delete the master journal file if it is safe to do so.
This step is optional. It is here only to prevent stale
master journals from cluttering up the disk drive.
See the discussion below for details.</li>
<li>Drop the EXCLUSIVE and PENDING locks but retain the SHARED lock.</li>
</ol>
<p>After the algorithm above completes successfully, it is safe to
read from the database file. Once all reading has completed, the
SHARED lock is dropped.</p>
}
HEADING 2 {Deleting stale master journals}
puts {
<p>A stale master journal is a master journal that is no longer being
used for anything. There is no requirement that stale master journals
be deleted. The only reason for doing so is to free up disk space.</p>
<p>A master journal is stale if no individual file journals are pointing
to it. To figure out if a master journal is stale, we first read the
master journal to obtain the names of all of its file journals. Then
we check each of those file journals. If any of the file journals named
in the master journal exists and points back to the master journal, then
the master journal is not stale. If all file journals are either missing
or refer to other master journals or no master journal at all, then the
master journal we are testing is stale and can be safely deleted.</p>
}
HEADING 2 {Writing to a database file}
puts {
<p>To write to a database, a process must first acquire a SHARED lock
as described above (possibly rolling back incomplete changes if there
is a hot journal).
After a SHARED lock is obtained, a RESERVED lock must be acquired.
The RESERVED lock signals that the process intends to write to the
database at some point in the future. Only one process at a time
can hold a reserved lock. But other processes can continue to read
the database while the RESERVED lock is held.
</p>
<p>If the process that wants to write is unable to obtain a RESERVED
lock, it must mean that another process already has a RESERVED lock.
In that case, the write attempt fails and returns SQLITE_BUSY.</p>
<p>After obtaining a RESERVED lock, the process that wants to write
creates a rollback journal. The header of the journal is initialized
with the original size of the database file. Space in the journal header
is also reserved for a master journal name, though the master journal
name is initially empty.</p>
<p>Before making changes to any page of the database, the process writes
the original value of that page into the rollback journal. Changes
to pages are held in memory at first and are not written to the disk.
The original database file remains unaltered, which means that other
processes can continue to read the database.</p>
<p>Eventually, the writing process will want to update the database
file, either because its memory cache has filled up or because it is
ready to commit its changes. Before this happens, the writer must
make sure no other process is reading the database and that the rollback
journal data is safely on the disk surface so that it can be used to
rollback incomplete changes in the event of a power failure.
The steps are as follows:</p>
<ol>
<li>Make sure all rollback journal data has actually been written to
the surface of the disk (and is not just being held in the operating
system's or disk controller's cache) so that if a power failure occurs
the data will still be there after power is restored.</li>
<li>Obtain a PENDING lock and then an EXCLUSIVE lock on the database file.
If other processes still have SHARED locks, the writer might have
to wait until those SHARED locks clear before it is able to obtain
an EXCLUSIVE lock.</li>
<li>Write all page modifications currently held in memory out to the
original database disk file.</li>
</ol>
<p>
If the reason for writing to the database file is because the memory
cache was full, then the writer will not commit right away. Instead,
the writer might continue to make changes to other pages. Before
subsequent changes are written to the database file, the rollback
journal must be flushed to disk again. Note also that the EXCLUSIVE
lock that the writer obtained in order to write to the database initially
must be held until all changes are committed. That means that from the
time the memory cache first spills to disk up until the transaction
commits, no other processes are able to access the database.
</p>
<p>
When a writer is ready to commit its changes, it executes the following
steps:
</p>
<ol>
<li value="4">
Obtain an EXCLUSIVE lock on the database file and
make sure all memory changes have been written to the database file
using the algorithm of steps 1-3 above.</li>
<li>Flush all database file changes to the disk. Wait for those changes
to actually be written onto the disk surface.</li>
<li>Delete the journal file. This is the instant when the changes are
committed. Prior to deleting the journal file, if a power failure
or crash occurs, the next process to open the database will see that
it has a hot journal and will roll the changes back.
After the journal is deleted, there will no longer be a hot journal
and the changes will persist.
</li>
<li>Drop the EXCLUSIVE and PENDING locks from the database file.
</li>
</ol>
<p>As soon as the PENDING lock is released from the database file, other
processes can begin reading the database again. In the current implementation,
the RESERVED lock is also released, but that is not essential. Future
versions of SQLite might provide a "CHECKPOINT" SQL command that will
commit all changes made so far within a transaction but retain the
RESERVED lock so that additional changes can be made without giving
any other process an opportunity to write.</p>
<p>If a transaction involves multiple databases, then a more complex
commit sequence is used, as follows:</p>
<ol>
<li value="4">
Make sure all individual database files have an EXCLUSIVE lock and a
valid journal.
<li>Create a master-journal. The name of the master-journal is arbitrary.
(The current implementation appends random suffixes to the name of the
main database file until it finds a name that does not previously exist.)
Fill the master journal with the names of all the individual journals
and flush its contents to disk.
<li>Write the name of the master journal into
all individual journals (in space set aside for that purpose in the
headers of the individual journals) and flush the contents of the
individual journals to disk and wait for those changes to reach the
disk surface.
<li>Flush all database file changes to the disk. Wait for those changes
to actually be written onto the disk surface.</li>
<li>Delete the master journal file. This is the instant when the changes are
committed. Prior to deleting the master journal file, if a power failure
or crash occurs, the individual file journals will be considered hot
and will be rolled back by the next process that
attempts to read them. After the master journal has been deleted,
the file journals will no longer be considered hot and the changes
will persist.
</li>
<li>Delete all individual journal files.
<li>Drop the EXCLUSIVE and PENDING locks from all database files.
</li>
</ol>
}
HEADING 1 {How To Corrupt Your Database Files}
puts {
<p>The pager module is robust but it is not completely failsafe.
It can be subverted. This section attempts to identify and explain
the risks.</p>
<p>
Clearly, a hardware or operating system fault that introduces incorrect data
into the middle of the database file or journal will cause problems.
Likewise,
if a rogue process opens a database file or journal and writes malformed
data into the middle of it, then the database will become corrupt.
There is not much that can be done about these kinds of problems so
they are given no further attention.
</p>
<p>
SQLite uses POSIX advisory locks to implement locking on Unix. On
windows it uses the LockFile(), LockFileEx(), and UnlockFile() system
calls. SQLite assumes that these system calls all work as advertised. If
that is not the case, then database corruption can result. One should
note that POSIX advisory locking is known to be buggy or even unimplemented
on many NFS implementations (including recent versions of Mac OS X)
and that there are persistent reports of locking problems
for network filesystems under windows. Your best defense is to not
use SQLite for files on a network filesystem.
</p>
<p>
SQLite uses the fsync() system call to flush data to the disk under Unix and
it uses FlushFileBuffers() to do the same under Windows. Once again,
SQLite assumes that these operating system services function as advertised.
But it has been reported that fsync() and FlushFileBuffers() do not always
work correctly, especially with inexpensive IDE disks. Apparently some
manufacturers of IDE disks have defective controller chips that report
that data has reached the disk surface when in fact the data is still
in volatile cache memory in the disk drive electronics. There are also
reports that Windows sometimes chooses to ignore FlushFileBuffers() for
unspecified reasons. The author cannot verify any of these reports.
But if they are true, it means that database corruption is a possibility
following an unexpected power loss. These are hardware and/or operating
system bugs that SQLite is unable to defend against.
</p>
<p>
If a crash or power failure occurs and results in a hot journal, but that
journal is deleted, the next process to open the database will not
know that it contains changes that need to be rolled back. The rollback
will not occur and the database will be left in an inconsistent state.
Rollback journals might be deleted for any number of reasons:
</p>
<ul>
<li>An administrator might be cleaning up after an OS crash or power failure,
see the journal file, think it is junk, and delete it.</li>
<li>Someone (or some process) might rename the database file but fail to
also rename its associated journal.</li>
<li>If the database file has aliases (hard or soft links) and the file
is opened by a different alias than the one used to create the journal,
then the journal will not be found. To avoid this problem, you should
not create links to SQLite database files.</li>
<li>Filesystem corruption following a power failure might cause the
journal to be renamed or deleted.</li>
</ul>
<p>
The last (fourth) bullet above merits additional comment. When SQLite creates
a journal file on Unix, it opens the directory that contains that file and
calls fsync() on the directory, in an effort to push the directory information
to disk. But suppose some other process is adding or removing unrelated
files to the directory that contains the database and journal at the
moment of a power failure. The supposedly unrelated actions of this other
process might result in the journal file being dropped from the directory and
moved into "lost+found". This is an unlikely scenario, but it could happen.
The best defenses are to use a journaling filesystem or to keep the
database and journal in a directory by themselves.
</p>
<p>
For a commit involving multiple databases and a master journal, if the
various databases were on different disk volumes and a power failure occurs
during the commit, then when the machine comes back up the disks might
be remounted with different names. Or some disks might not be mounted
at all. When this happens the individual file journals and the master
journal might not be able to find each other. The worst outcome from
this scenario is that the commit ceases to be atomic.
Some databases might be rolled back and others might not.
All databases will continue to be self-consistent.
To defend against this problem, keep all databases
on the same disk volume and/or remount disks using exactly the same names
after a power failure.
</p>
}
HEADING 1 {Transaction Control At The SQL Level}
puts {
<p>
The changes to locking and concurrency control in SQLite version 3 also
introduce some subtle changes in the way transactions work at the SQL
language level.
By default, SQLite version 3 operates in "autocommit" mode. In autocommit mode,
all changes to the database are committed as soon as all operations associated
with the current database connection complete.</p>
<p>The SQL command "BEGIN TRANSACTION" (the TRANSACTION keyword
is optional) is used to take SQLite out of autocommit mode.
Note that the BEGIN command does not acquire any locks on the database.
After a BEGIN command, a SHARED lock will be acquired when the first
SELECT statement is executed. A RESERVED lock will be acquired when
the first INSERT, UPDATE, or DELETE statement is executed. No EXCLUSIVE
lock is acquired until either the memory cache fills up and must
be spilled to disk or until the transaction commits. In this way,
the system delays blocking read access to the file until the
last possible moment.
</p>
<p>The SQL command "COMMIT" does not actually commit the changes to
disk. It just turns autocommit back on. Then, at the conclusion of
the command, the regular autocommit logic takes over and causes the
actual commit to disk to occur.
The SQL command "ROLLBACK" also operates by turning autocommit back on,
but it also sets a flag that tells the autocommit logic to rollback rather
than commit.</p>
<p>If the SQL COMMIT command turns autocommit on and the autocommit logic
then tries to commit changes but fails because some other process is holding
a SHARED lock, then autocommit is turned back off automatically. This
allows the user to retry the COMMIT at a later time after the SHARED lock
has had an opportunity to clear.</p>
}
footer $rcsid