NetBSD/lib/libc/db/btree/storage.c

665 lines
15 KiB
C

/*-
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Mike Olson.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#if defined(LIBC_SCCS) && !defined(lint)
static char sccsid[] = "@(#)storage.c 5.2 (Berkeley) 2/22/91";
#endif /* LIBC_SCCS and not lint */
#include <sys/param.h>
#include <db.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "btree.h"
/*
* BT_GETPAGE -- Make pgno the current page of the btree.
*
* This routine is just a wrapper that decides whether to call the
* memory or disk-based routine to do the work.
*
* Parameters:
* t -- btree in which to get page
* pgno -- page number to get
*
* Returns:
* RET_SUCCESS or RET_ERROR.
*/
int
_bt_getpage(t, pgno)
BTREE_P t;
pgno_t pgno;
{
#ifdef DEBUG
if (pgno == P_NONE)
_punt();
#endif /* DEBUG */
/* see if we can get away without doing any work */
if (t->bt_curpage != (BTHEADER *) NULL) {
if (t->bt_curpage->h_pgno == pgno)
return (RET_SUCCESS);
}
if (t->bt_fname == (char *) NULL)
return (_bt_getmpage(t, pgno));
else
return (_bt_getdpage(t, pgno));
}
/*
* _BT_GETMPAGE -- Make pgno the current page of the btree.
*
* This routine gets pages for in-memory btrees.
*
* Parameters:
* t -- btree in which to get page
* pgno -- page number to get
*
* Returns:
* RET_SUCCESS or RET_ERROR.
*/
int
_bt_getmpage(t, pgno)
register BTREE_P t;
pgno_t pgno;
{
int htindex;
BTHEADER *h;
HTBUCKET *b;
if (t->bt_curpage == (BTHEADER *) NULL) {
if (pgno != P_ROOT) {
errno = EBADF;
return (RET_ERROR);
}
t->bt_npages++;
h = (BTHEADER *) malloc((unsigned) t->bt_psize);
if (h == (BTHEADER *) NULL)
return (RET_ERROR);
h->h_pgno = P_ROOT;
h->h_flags = F_LEAF;
h->h_lower = (index_t)
(((char *) &(h->h_linp[0])) - ((char *) h));
h->h_upper = t->bt_psize;
h->h_prevpg = h->h_nextpg = P_NONE;
t->bt_curpage = h;
/* get the root page into the hash table */
if (_bt_write(t, h, RELEASE) == RET_ERROR)
return (RET_ERROR);
}
htindex = HASHKEY(pgno);
for (b = t->bt_s.bt_ht[htindex];
b != (HTBUCKET *) NULL;
b = b->ht_next) {
if (b->ht_pgno == pgno) {
t->bt_curpage = b->ht_page;
return (RET_SUCCESS);
}
}
return (RET_ERROR);
}
/*
* _BT_GETDPAGE -- Make pgno the current page of the btree.
*
* This routine gets pages for disk btrees.
*
* Because disk btree pages must be readable across machine architectures,
* the btree code writes integers out in network format. This routine
* converts them back to host format before returning the page.
*
* Parameters:
* t -- btree in which to get page
* pgno -- page number to get
*
* Returns:
* RET_SUCCESS, RET_ERROR.
*/
int
_bt_getdpage(t, pgno)
register BTREE_P t;
pgno_t pgno;
{
BTHEADER *h;
char *cache;
long pos;
int n, nbytes;
/* if we have an lru cache, let the cache code do the work */
if (ISCACHE(t)) {
cache = t->bt_s.bt_d.d_cache;
/* release the old page */
if (t->bt_curpage != (BTHEADER *) NULL) {
pgno_t opgno = t->bt_curpage->h_pgno;
t->bt_curpage->h_flags &= ~F_DIRTY;
if (lruwrite(cache, (int) opgno) < 0)
return (RET_ERROR);
if (lrurelease(cache, (int) opgno) < 0)
return (RET_ERROR);
}
if (pgno > t->bt_npages) {
if ((h = (BTHEADER *) lrugetnew(cache, (int)pgno, &nbytes))
== (BTHEADER *) NULL)
return (RET_ERROR);
t->bt_npages = pgno;
} else {
if ((h = (BTHEADER *) lruget(cache, (int)pgno, &nbytes))
== (BTHEADER *) NULL)
return (RET_ERROR);
}
/* init this page, if necessary */
if (nbytes == 0) {
h->h_pgno = pgno;
h->h_flags = F_LEAF;
h->h_lower = (index_t)
(((char *) &(h->h_linp[0])) - ((char *) h));
h->h_upper = t->bt_psize;
h->h_prevpg = h->h_nextpg = P_NONE;
}
t->bt_curpage = h;
return (RET_SUCCESS);
}
/* sync the current page, if necessary */
if (t->bt_curpage != (BTHEADER *) NULL) {
if (t->bt_curpage->h_flags & F_DIRTY)
if (_bt_write(t, t->bt_curpage, RELEASE) == RET_ERROR)
return (RET_ERROR);
} else {
if (t->bt_npages == 0)
t->bt_npages = 1;
/* if no current page, get space for one */
if ((t->bt_curpage = (BTHEADER *) malloc((unsigned) t->bt_psize))
== (BTHEADER *) NULL) {
return (RET_ERROR);
}
}
n = t->bt_psize;
pos = (long) (pgno * n);
/* seek to correct location in file */
if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) {
return (RET_ERROR);
}
/* read the page */
if ((nbytes = read(t->bt_s.bt_d.d_fd, t->bt_curpage, n)) < n) {
/*
* If we didn't get a full page, we must have gotten no
* data at all -- in which case we're trying to read a
* root page that doesn't exist yet. This is the only
* case in which this is okay. If this happens, construct
* an empty root page by hand.
*/
if (nbytes != 0 || pgno != P_ROOT) {
errno = EBADF;
return (RET_ERROR);
}
h = (BTHEADER *) t->bt_curpage;
h->h_pgno = pgno;
h->h_flags = F_LEAF;
h->h_lower = (index_t)
(((char *) &(h->h_linp[0])) - ((char *) h));
h->h_upper = t->bt_psize;
h->h_prevpg = h->h_nextpg = P_NONE;
} else
(void) _bt_pgin(t->bt_curpage, (char *) t->bt_lorder);
return (RET_SUCCESS);
}
/*
* _BT_PGOUT, _BT_PGIN -- Convert host-specific number layout to/from
* the host-independent format stored on disk.
*
* Parameters:
* h -- page to convert
* _lorder -- byte order for pages (stored as a char * in the
* cache, and passed around as a magic cookie).
*
* Returns:
* RET_SUCCESS (lru code requires a return value).
*
* Side Effects:
* Layout of tree metadata on the page is changed in place.
*
* Warnings:
* Everywhere else in the code, the types pgno_t and index_t
* are opaque. These two routines know what they really
* are.
*/
int
_bt_pgout(h, _lorder)
BTHEADER *h;
char *_lorder;
{
int i;
int top;
int lorder;
DATUM *d;
IDATUM *id;
size_t chain;
lorder = (int) _lorder;
if (lorder == BYTE_ORDER)
return (RET_SUCCESS);
if (h->h_flags & F_LEAF) {
if (h->h_flags & F_CONT) {
if (h->h_prevpg == P_NONE) {
size_t longsz;
(void) bcopy((char *) &(h->h_linp[0]),
(char *) &longsz,
sizeof(longsz));
BLSWAP(longsz);
(void) bcopy((char *) &longsz,
(char *) &(h->h_linp[0]),
sizeof(longsz));
}
} else {
top = NEXTINDEX(h);
for (i = 0; i < top; i++) {
d = (DATUM *) GETDATUM(h, i);
if (d->d_flags & D_BIGKEY) {
(void) bcopy((char *) &(d->d_bytes[0]),
(char *) &chain,
sizeof(chain));
BLSWAP(chain);
(void) bcopy((char *) &chain,
(char *) &(d->d_bytes[0]),
sizeof(chain));
}
if (d->d_flags & D_BIGDATA) {
(void) bcopy((char *) &(d->d_bytes[d->d_ksize]),
(char *) &chain,
sizeof(chain));
BLSWAP(chain);
(void) bcopy((char *) &chain,
(char *) &(d->d_bytes[d->d_ksize]),
sizeof(chain));
}
BLSWAP(d->d_dsize);
BLSWAP(d->d_ksize);
BLSWAP(d->d_flags);
BLSWAP(h->h_linp[i]);
}
}
} else {
top = NEXTINDEX(h);
for (i = 0; i < top; i++) {
id = (IDATUM *) GETDATUM(h, i);
BLSWAP(id->i_size);
BLSWAP(id->i_pgno);
BLSWAP(id->i_flags);
if (id->i_flags & D_BIGKEY) {
(void) bcopy((char *) &(id->i_bytes[0]),
(char *) &chain,
sizeof(chain));
BLSWAP(chain);
(void) bcopy((char *) &chain,
(char *) &(id->i_bytes[0]),
sizeof(chain));
}
BLSWAP(h->h_linp[i]);
}
}
BLSWAP(h->h_flags);
BLSWAP(h->h_pgno);
BLSWAP(h->h_prevpg);
BLSWAP(h->h_nextpg);
BLSWAP(h->h_lower);
BLSWAP(h->h_upper);
return (RET_SUCCESS);
}
int
_bt_pgin(h, _lorder)
BTHEADER *h;
char *_lorder;
{
int i;
int top;
int lorder;
DATUM *d;
IDATUM *id;
size_t chain;
/*
* If btree pages are to be stored in the host byte order, don't
* bother swapping.
*/
lorder = (int) _lorder;
if (lorder == BYTE_ORDER)
return (RET_SUCCESS);
BLSWAP(h->h_upper);
BLSWAP(h->h_lower);
BLSWAP(h->h_nextpg);
BLSWAP(h->h_prevpg);
BLSWAP(h->h_pgno);
BLSWAP(h->h_flags);
if (h->h_flags & F_LEAF) {
if (h->h_flags & F_CONT) {
if (h->h_prevpg == P_NONE) {
size_t longsz;
(void) bcopy((char *) &(h->h_linp[0]),
(char *) &longsz,
sizeof(longsz));
BLSWAP(longsz);
(void) bcopy((char *) &longsz,
(char *) &(h->h_linp[0]),
sizeof(longsz));
}
} else {
top = NEXTINDEX(h);
for (i = 0; i < top; i++) {
BLSWAP(h->h_linp[i]);
d = (DATUM *) GETDATUM(h, i);
BLSWAP(d->d_dsize);
BLSWAP(d->d_ksize);
BLSWAP(d->d_flags);
if (d->d_flags & D_BIGKEY) {
(void) bcopy((char *) &(d->d_bytes[0]),
(char *) &chain,
sizeof(chain));
BLSWAP(chain);
(void) bcopy((char *) &chain,
(char *) &(d->d_bytes[0]),
sizeof(chain));
}
if (d->d_flags & D_BIGDATA) {
(void) bcopy((char *) &(d->d_bytes[d->d_ksize]),
(char *) &chain,
sizeof(chain));
BLSWAP(chain);
(void) bcopy((char *) &chain,
(char *) &(d->d_bytes[d->d_ksize]),
sizeof(chain));
}
}
}
} else {
top = NEXTINDEX(h);
for (i = 0; i < top; i++) {
BLSWAP(h->h_linp[i]);
id = (IDATUM *) GETDATUM(h, i);
BLSWAP(id->i_size);
BLSWAP(id->i_pgno);
BLSWAP(id->i_flags);
if (id->i_flags & D_BIGKEY) {
(void) bcopy((char *) &(id->i_bytes[0]),
(char *) &chain,
sizeof(chain));
BLSWAP(chain);
(void) bcopy((char *) &chain,
(char *) &(id->i_bytes[0]),
sizeof(chain));
}
}
}
return (RET_SUCCESS);
}
/*
* _BT_ALLOCPG -- allocate a new page in the btree.
*
* This is called when we split a page, to get space to do the split.
* For disk btrees, these pages are released when the split is done.
* For memory btrees, they are not.
*
* Parameters:
* t -- tree in which to do split
*
* Returns:
* Pointer to the newly-allocated page
*/
BTHEADER *
_bt_allocpg(t)
BTREE_P t;
{
BTHEADER *h = t->bt_curpage;
BTHEADER *nh;
int nbytes;
/* if we have a cache, let the cache code do the work */
if (ISDISK(t) && ISCACHE(t)) {
nh = (BTHEADER *) lrugetnew(t->bt_s.bt_d.d_cache,
(int) (t->bt_npages + 1),
&nbytes);
} else {
nh = (BTHEADER *) malloc((unsigned) t->bt_psize);
}
if (nh != (BTHEADER *) NULL) {
nh->h_pgno = nh->h_prevpg = nh->h_nextpg = P_NONE;
nh->h_flags = h->h_flags;
nh->h_lower = (index_t)
(((char *) &(nh->h_linp[0])) - ((char *) nh));
nh->h_upper = t->bt_psize;
}
return (nh);
}
/*
* _BT_WRITE -- Write a specific page to a btree file.
*
* Parameters:
* t -- btree to get the page
* h -- page to write
* relflag -- (int) this page may/may not be released
*
* Returns:
* RET_SUCCESS, RET_ERROR.
*
* Side Effects:
* Writes a metadata page if none has been written yet.
*/
int
_bt_write(t, h, relflag)
BTREE_P t;
BTHEADER *h;
int relflag;
{
long pos;
int htindex;
HTBUCKET *b;
char *cache;
pgno_t pgno;
h->h_flags &= ~F_DIRTY;
if (ISDISK(t)) {
/* if we haven't done so yet, write the metadata */
if (!(t->bt_flags & BTF_METAOK)) {
if (_bt_wrtmeta(t) == RET_ERROR)
return (RET_ERROR);
}
pgno = h->h_pgno;
/* if we have a cache, let the cache code do the work */
if ((cache = t->bt_s.bt_d.d_cache) != (char *) NULL) {
if (lruwrite(cache, (int) pgno) < 0)
return (RET_ERROR);
if (relflag && lrurelease(cache, (int) pgno) < 0)
return (RET_ERROR);
} else {
(void) _bt_pgout(h, (char *) t->bt_lorder);
/* now write the current page */
pos = (long) (pgno * t->bt_psize);
if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos)
return (RET_ERROR);
if (write(t->bt_s.bt_d.d_fd, (char *) h, (int) t->bt_psize)
< t->bt_psize)
return (RET_ERROR);
if (!relflag)
(void) _bt_pgin(h, (char *) t->bt_lorder);
}
} else {
/* in-memory btree */
htindex = HASHKEY(h->h_pgno);
/* see if we need to overwrite existing entry */
for (b = t->bt_s.bt_ht[htindex];
b != (HTBUCKET *) NULL;
b = b->ht_next) {
if (b->ht_pgno == h->h_pgno) {
b->ht_page = h;
return (RET_SUCCESS);
}
}
/* new entry, write it */
b = (HTBUCKET *) malloc((unsigned) sizeof (HTBUCKET));
if (b == (HTBUCKET *) NULL)
return (RET_ERROR);
b->ht_pgno = h->h_pgno;
b->ht_page = h;
b->ht_next = t->bt_s.bt_ht[htindex];
t->bt_s.bt_ht[htindex] = b;
}
return (RET_SUCCESS);
}
/*
* _BT_RELEASE -- Release a locked-down cache page
*
* During page splits, we want to force pages out to the cache
* before we actually release them, in some cases. This routine
* releases such a page when it is no longer needed.
*
* Parameters:
* t -- btree in which to release page
* h -- page to release
*
* Returns:
* RET_SUCCESS, RET_ERROR.
*
* Side Effects:
* None.
*/
int
_bt_release(t, h)
BTREE_P t;
BTHEADER *h;
{
if (ISDISK(t) && ISCACHE(t)) {
if (lrurelease(t->bt_s.bt_d.d_cache, (int) (h->h_pgno)) < 0)
return (RET_ERROR);
}
return (RET_SUCCESS);
}
/*
* _BT_WRTMETA -- Write metadata to the btree.
*
* Parameters:
* t -- tree to which to write
*
* Returns:
* RET_SUCCESS, RET_ERROR.
*/
int
_bt_wrtmeta(t)
BTREE_P t;
{
BTMETA m;
if (lseek(t->bt_s.bt_d.d_fd, 0l, L_SET) != 0l)
return (RET_ERROR);
/* lorder has to be in host-independent format */
m.m_lorder = (u_long) htonl((long) t->bt_lorder);
m.m_magic = BTREEMAGIC;
m.m_version = BTREEVERSION;
m.m_psize = t->bt_psize;
m.m_free = t->bt_free;
m.m_flags = t->bt_flags & BTF_NODUPS;
if (t->bt_lorder != BYTE_ORDER) {
BLSWAP(m.m_magic);
BLSWAP(m.m_version);
BLSWAP(m.m_psize);
BLSWAP(m.m_free);
BLSWAP(m.m_flags);
}
if (write(t->bt_s.bt_d.d_fd, (char *) &m, sizeof(m))
!= sizeof(m)) {
return (RET_ERROR);
}
t->bt_flags |= BTF_METAOK;
return (RET_SUCCESS);
}