diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index d9da5e1ee3..ad93e81640 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/hash/hashinsert.c,v 1.27 2003/08/04 02:39:57 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashinsert.c,v 1.28 2003/09/01 20:26:34 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -155,7 +155,7 @@ _hash_insertonpg(Relation rel, * page with enough room. allocate a new overflow page. */ do_expand = true; - ovflbuf = _hash_addovflpage(rel, &metabuf, buf); + ovflbuf = _hash_addovflpage(rel, metabuf, buf); _hash_relbuf(rel, buf, HASH_WRITE); buf = ovflbuf; page = BufferGetPage(buf); @@ -186,18 +186,15 @@ _hash_insertonpg(Relation rel, * access type just for a moment to allow greater accessibility to * the metapage. */ - metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, - HASH_READ, HASH_WRITE); - metap->hashm_nkeys += 1; - metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, - HASH_WRITE, HASH_READ); - + _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_WRITE); + metap->hashm_ntuples += 1; + _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_READ); } _hash_wrtbuf(rel, buf); if (do_expand || - (metap->hashm_nkeys / (metap->hashm_maxbucket + 1)) + (metap->hashm_ntuples / (metap->hashm_maxbucket + 1)) > metap->hashm_ffactor) _hash_expandtable(rel, metabuf); _hash_relbuf(rel, metabuf, HASH_READ); diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 06233b817b..aa74c547da 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.37 2003/08/04 02:39:57 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.38 2003/09/01 20:26:34 tgl Exp $ * * NOTES * Overflow pages look like ordinary relation pages. @@ -20,24 +20,73 @@ #include "access/hash.h" -static OverflowPageAddress _hash_getovfladdr(Relation rel, Buffer *metabufp); +static BlockNumber _hash_getovflpage(Relation rel, Buffer metabuf); static uint32 _hash_firstfreebit(uint32 map); + +/* + * Convert overflow page bit number (its index in the free-page bitmaps) + * to block number within the index. + */ +static BlockNumber +bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum) +{ + uint32 splitnum = metap->hashm_ovflpoint; + uint32 i; + + /* Convert zero-based bitnumber to 1-based page number */ + ovflbitnum += 1; + + /* Determine the split number for this page (must be >= 1) */ + for (i = 1; + i < splitnum && ovflbitnum > metap->hashm_spares[i]; + i++) + /* loop */ ; + + /* + * Convert to absolute page number by adding the number of bucket pages + * that exist before this split point. + */ + return (BlockNumber) ((1 << i) + ovflbitnum); +} + +/* + * Convert overflow page block number to bit number for free-page bitmap. + */ +static uint32 +blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno) +{ + uint32 splitnum = metap->hashm_ovflpoint; + uint32 i; + uint32 bitnum; + + /* Determine the split number containing this page */ + for (i = 1; i <= splitnum; i++) + { + if (ovflblkno <= (BlockNumber) (1 << i)) + break; /* oops */ + bitnum = ovflblkno - (1 << i); + if (bitnum <= metap->hashm_spares[i]) + return bitnum - 1; /* -1 to convert 1-based to 0-based */ + } + + elog(ERROR, "invalid overflow block number %u", ovflblkno); + return 0; /* keep compiler quiet */ +} + /* * _hash_addovflpage * * Add an overflow page to the page currently pointed to by the buffer * argument 'buf'. * - * *Metabufp has a read lock upon entering the function; buf has a - * write lock. - * + * metabuf has a read lock upon entering the function; buf has a + * write lock. The same is true on exit. The returned overflow page + * is write-locked. */ Buffer -_hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf) +_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf) { - - OverflowPageAddress oaddr; BlockNumber ovflblkno; Buffer ovflbuf; HashMetaPage metap; @@ -52,17 +101,12 @@ _hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf) pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(!BlockNumberIsValid(pageopaque->hasho_nextblkno)); - metap = (HashMetaPage) BufferGetPage(*metabufp); + metap = (HashMetaPage) BufferGetPage(metabuf); _hash_checkpage((Page) metap, LH_META_PAGE); /* allocate an empty overflow page */ - oaddr = _hash_getovfladdr(rel, metabufp); - if (oaddr == InvalidOvflAddress) - elog(ERROR, "_hash_getovfladdr failed"); - ovflblkno = OADDR_TO_BLKNO(OADDR_OF(SPLITNUM(oaddr), OPAGENUM(oaddr))); - Assert(BlockNumberIsValid(ovflblkno)); + ovflblkno = _hash_getovflpage(rel, metabuf); ovflbuf = _hash_getbuf(rel, ovflblkno, HASH_WRITE); - Assert(BufferIsValid(ovflbuf)); ovflpage = BufferGetPage(ovflbuf); /* initialize the new overflow page */ @@ -71,7 +115,7 @@ _hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf) ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); ovflopaque->hasho_nextblkno = InvalidBlockNumber; ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; - ovflopaque->hasho_oaddr = oaddr; + ovflopaque->hasho_oaddr = 0; ovflopaque->hasho_bucket = pageopaque->hasho_bucket; _hash_wrtnorelbuf(ovflbuf); @@ -82,191 +126,141 @@ _hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf) } /* - * _hash_getovfladdr() + * _hash_getovflpage() * - * Find an available overflow page and return its address. + * Find an available overflow page and return its block number. * - * When we enter this function, we have a read lock on *metabufp which + * When we enter this function, we have a read lock on metabuf which * we change to a write lock immediately. Before exiting, the write lock * is exchanged for a read lock. - * */ -static OverflowPageAddress -_hash_getovfladdr(Relation rel, Buffer *metabufp) +static BlockNumber +_hash_getovflpage(Relation rel, Buffer metabuf) { HashMetaPage metap; Buffer mapbuf = 0; BlockNumber blkno; - PageOffset offset; - OverflowPageAddress oaddr; - SplitNumber splitnum; + uint32 splitnum; uint32 *freep = NULL; - uint32 max_free; + uint32 max_ovflpg; uint32 bit; uint32 first_page; - uint32 free_bit; - uint32 free_page; - uint32 in_use_bits; + uint32 last_bit; + uint32 last_page; uint32 i, j; - metap = (HashMetaPage) _hash_chgbufaccess(rel, metabufp, HASH_READ, HASH_WRITE); - + _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_WRITE); + metap = (HashMetaPage) BufferGetPage(metabuf); splitnum = metap->hashm_ovflpoint; - max_free = metap->hashm_spares[splitnum]; - free_page = (max_free - 1) >> (metap->hashm_bshift + BYTE_TO_BIT); - free_bit = (max_free - 1) & (BMPGSZ_BIT(metap) - 1); + /* end search with the last existing overflow page */ + max_ovflpg = metap->hashm_spares[splitnum] - 1; + last_page = max_ovflpg >> BMPG_SHIFT(metap); + last_bit = max_ovflpg & BMPG_MASK(metap); - /* Look through all the free maps to find the first free block */ - first_page = metap->hashm_lastfreed >> (metap->hashm_bshift + BYTE_TO_BIT); - for (i = first_page; i <= free_page; i++) + /* start search at hashm_firstfree */ + first_page = metap->hashm_firstfree >> BMPG_SHIFT(metap); + bit = metap->hashm_firstfree & BMPG_MASK(metap); + j = bit / BITS_PER_MAP; + bit &= ~(BITS_PER_MAP - 1); + + for (i = first_page; i <= last_page; i++) { + BlockNumber mapblkno; Page mappage; + uint32 last_inpage; - blkno = metap->hashm_mapp[i]; - mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE); + mapblkno = metap->hashm_mapp[i]; + mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE); mappage = BufferGetPage(mapbuf); _hash_checkpage(mappage, LH_BITMAP_PAGE); freep = HashPageGetBitmap(mappage); - Assert(freep); - if (i == free_page) - in_use_bits = free_bit; - else - in_use_bits = BMPGSZ_BIT(metap) - 1; - - if (i == first_page) - { - bit = metap->hashm_lastfreed & (BMPGSZ_BIT(metap) - 1); - j = bit / BITS_PER_MAP; - bit = bit & ~(BITS_PER_MAP - 1); - } - else + if (i != first_page) { bit = 0; j = 0; } - for (; bit <= in_use_bits; j++, bit += BITS_PER_MAP) + + if (i == last_page) + last_inpage = last_bit; + else + last_inpage = BMPGSZ_BIT(metap) - 1; + + for (; bit <= last_inpage; j++, bit += BITS_PER_MAP) + { if (freep[j] != ALL_SET) goto found; + } + + _hash_relbuf(rel, mapbuf, HASH_WRITE); } /* No Free Page Found - have to allocate a new page */ - metap->hashm_lastfreed = metap->hashm_spares[splitnum]; + bit = metap->hashm_spares[splitnum]; metap->hashm_spares[splitnum]++; - offset = metap->hashm_spares[splitnum] - - (splitnum ? metap->hashm_spares[splitnum - 1] : 0); - - if (offset > SPLITMASK) - { - if (++splitnum >= NCACHED) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("out of overflow pages in hash index \"%s\"", - RelationGetRelationName(rel)))); - metap->hashm_ovflpoint = splitnum; - metap->hashm_spares[splitnum] = metap->hashm_spares[splitnum - 1]; - metap->hashm_spares[splitnum - 1]--; - offset = 0; - } /* Check if we need to allocate a new bitmap page */ - if (free_bit == (uint32) (BMPGSZ_BIT(metap) - 1)) + if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1)) { - /* won't be needing old map page */ - - _hash_relbuf(rel, mapbuf, HASH_WRITE); - - free_page++; - if (free_page >= NCACHED) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("out of overflow pages in hash index \"%s\"", - RelationGetRelationName(rel)))); - /* - * This is tricky. The 1 indicates that you want the new page - * allocated with 1 clear bit. Actually, you are going to - * allocate 2 pages from this map. The first is going to be the - * map page, the second is the overflow page we were looking for. - * The init_bitmap routine automatically, sets the first bit of - * itself to indicate that the bitmap itself is in use. We would - * explicitly set the second bit, but don't have to if we tell - * init_bitmap not to leave it clear in the first place. + * We create the new bitmap page with all pages marked "in use". + * Actually two pages in the new bitmap's range will exist + * immediately: the bitmap page itself, and the following page + * which is the one we return to the caller. Both of these are + * correctly marked "in use". Subsequent pages do not exist yet, + * but it is convenient to pre-mark them as "in use" too. */ - if (_hash_initbitmap(rel, metap, OADDR_OF(splitnum, offset), - 1, free_page)) - elog(ERROR, "_hash_initbitmap failed"); + _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit)); + + bit = metap->hashm_spares[splitnum]; metap->hashm_spares[splitnum]++; - offset++; - if (offset > SPLITMASK) - { - if (++splitnum >= NCACHED) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("out of overflow pages in hash index \"%s\"", - RelationGetRelationName(rel)))); - metap->hashm_ovflpoint = splitnum; - metap->hashm_spares[splitnum] = metap->hashm_spares[splitnum - 1]; - metap->hashm_spares[splitnum - 1]--; - offset = 0; - } } else { /* - * Free_bit addresses the last used bit. Bump it to address the - * first available bit. + * Nothing to do here; since the page was past the last used page, + * we know its bitmap bit was preinitialized to "in use". */ - free_bit++; - SETBIT(freep, free_bit); - _hash_wrtbuf(rel, mapbuf); } + /* mark new page as first free so we don't search much next time */ + metap->hashm_firstfree = bit; + /* Calculate address of the new overflow page */ - oaddr = OADDR_OF(splitnum, offset); - _hash_chgbufaccess(rel, metabufp, HASH_WRITE, HASH_READ); - return oaddr; + blkno = bitno_to_blkno(metap, bit); + + _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_READ); + + return blkno; found: - bit = bit + _hash_firstfreebit(freep[j]); + /* convert bit to bit number within page */ + bit += _hash_firstfreebit(freep[j]); + + /* mark page "in use" */ SETBIT(freep, bit); _hash_wrtbuf(rel, mapbuf); - /* - * Bits are addressed starting with 0, but overflow pages are - * addressed beginning at 1. Bit is a bit addressnumber, so we need to - * increment it to convert it to a page number. - */ + /* convert bit to absolute bit number */ + bit += (i << BMPG_SHIFT(metap)); - bit = 1 + bit + (i * BMPGSZ_BIT(metap)); - if (bit >= metap->hashm_lastfreed) - metap->hashm_lastfreed = bit - 1; + /* adjust hashm_firstfree to avoid redundant searches */ + if (bit > metap->hashm_firstfree) + metap->hashm_firstfree = bit; - /* Calculate the split number for this page */ - for (i = 0; (i < splitnum) && (bit > metap->hashm_spares[i]); i++) - ; - offset = (i ? bit - metap->hashm_spares[i - 1] : bit); - if (offset >= SPLITMASK) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("out of overflow pages in hash index \"%s\"", - RelationGetRelationName(rel)))); + blkno = bitno_to_blkno(metap, bit); - /* initialize this page */ - oaddr = OADDR_OF(i, offset); - _hash_chgbufaccess(rel, metabufp, HASH_WRITE, HASH_READ); - return oaddr; + _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_READ); + + return blkno; } /* * _hash_firstfreebit() * - * Return the first bit that is not set in the argument 'map'. This - * function is used to find an available overflow page within a - * splitnumber. - * + * Return the number of the first bit that is not set in the word 'map'. */ static uint32 _hash_firstfreebit(uint32 map) @@ -279,7 +273,7 @@ _hash_firstfreebit(uint32 map) { if (!(mask & map)) return i; - mask = mask << 1; + mask <<= 1; } return i; } @@ -287,27 +281,29 @@ _hash_firstfreebit(uint32 map) /* * _hash_freeovflpage() - * - * Mark this overflow page as free and return a buffer with - * the page that follows it (which may be defined as - * InvalidBuffer). + * Remove this overflow page from its bucket's chain, and mark the page as + * free. On entry, ovflbuf is write-locked; it is released before exiting. * + * Returns the block number of the page that followed the given page + * in the bucket, or InvalidBlockNumber if no following page. + * + * NB: caller must not hold lock on metapage. */ -Buffer +BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf) { HashMetaPage metap; Buffer metabuf; Buffer mapbuf; + BlockNumber ovflblkno; BlockNumber prevblkno; BlockNumber blkno; BlockNumber nextblkno; HashPageOpaque ovflopaque; Page ovflpage; Page mappage; - OverflowPageAddress addr; - SplitNumber splitnum; uint32 *freep; - uint32 ovflpgno; + uint32 ovflbitno; int32 bitmappage, bitmapbit; Bucket bucket; @@ -316,10 +312,10 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf) metap = (HashMetaPage) BufferGetPage(metabuf); _hash_checkpage((Page) metap, LH_META_PAGE); + ovflblkno = BufferGetBlockNumber(ovflbuf); ovflpage = BufferGetPage(ovflbuf); _hash_checkpage(ovflpage, LH_OVERFLOW_PAGE); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); - addr = ovflopaque->hasho_oaddr; nextblkno = ovflopaque->hasho_nextblkno; prevblkno = ovflopaque->hasho_prevblkno; bucket = ovflopaque->hasho_bucket; @@ -359,20 +355,17 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf) } /* - * Fix up the overflow page bitmap that tracks this particular - * overflow page. The bitmap can be found in the MetaPageData array - * element hashm_mapp[bitmappage]. + * Clear the bitmap bit to indicate that this overflow page is free. */ - splitnum = (addr >> SPLITSHIFT); - ovflpgno = (splitnum ? metap->hashm_spares[splitnum - 1] : 0) + (addr & SPLITMASK) - 1; + ovflbitno = blkno_to_bitno(metap, ovflblkno); - if (ovflpgno < metap->hashm_lastfreed) - metap->hashm_lastfreed = ovflpgno; - - bitmappage = (ovflpgno >> (metap->hashm_bshift + BYTE_TO_BIT)); - bitmapbit = ovflpgno & (BMPGSZ_BIT(metap) - 1); + bitmappage = ovflbitno >> BMPG_SHIFT(metap); + bitmapbit = ovflbitno & BMPG_MASK(metap); + if (bitmappage >= metap->hashm_nmaps) + elog(ERROR, "invalid overflow bit number %u", ovflbitno); blkno = metap->hashm_mapp[bitmappage]; + mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE); mappage = BufferGetPage(mapbuf); _hash_checkpage(mappage, LH_BITMAP_PAGE); @@ -380,16 +373,13 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf) CLRBIT(freep, bitmapbit); _hash_wrtbuf(rel, mapbuf); - _hash_relbuf(rel, metabuf, HASH_WRITE); + /* if this is now the first free page, update hashm_firstfree */ + if (ovflbitno < metap->hashm_firstfree) + metap->hashm_firstfree = ovflbitno; - /* - * now instantiate the page that replaced this one, if it exists, and - * return that buffer with a write lock. - */ - if (BlockNumberIsValid(nextblkno)) - return _hash_getbuf(rel, nextblkno, HASH_WRITE); - else - return InvalidBuffer; + _hash_wrtbuf(rel, metabuf); + + return nextblkno; } @@ -397,65 +387,49 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf) * _hash_initbitmap() * * Initialize a new bitmap page. The metapage has a write-lock upon - * entering the function. + * entering the function, and must be written by caller after return. * - * 'pnum' is the OverflowPageAddress of the new bitmap page. - * 'nbits' is how many bits to clear (i.e., make available) in the new - * bitmap page. the remainder of the bits (as well as the first bit, - * representing the bitmap page itself) will be set. - * 'ndx' is the 0-based offset of the new bitmap page within the - * metapage's array of bitmap page OverflowPageAddresses. + * 'blkno' is the block number of the new bitmap page. + * + * All bits in the new bitmap page are set to "1", indicating "in use". */ - -#define INT_MASK ((1 << INT_TO_BIT) -1) - -int32 -_hash_initbitmap(Relation rel, - HashMetaPage metap, - int32 pnum, - int32 nbits, - int32 ndx) +void +_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno) { Buffer buf; - BlockNumber blkno; Page pg; HashPageOpaque op; uint32 *freep; - int clearbytes, - clearints; - blkno = OADDR_TO_BLKNO(pnum); + /* initialize the page */ buf = _hash_getbuf(rel, blkno, HASH_WRITE); pg = BufferGetPage(buf); _hash_pageinit(pg, BufferGetPageSize(buf)); op = (HashPageOpaque) PageGetSpecialPointer(pg); - op->hasho_oaddr = InvalidOvflAddress; + op->hasho_oaddr = 0; op->hasho_prevblkno = InvalidBlockNumber; op->hasho_nextblkno = InvalidBlockNumber; op->hasho_flag = LH_BITMAP_PAGE; op->hasho_bucket = -1; + /* set all of the bits to 1 */ freep = HashPageGetBitmap(pg); + MemSet((char *) freep, 0xFF, BMPGSZ_BYTE(metap)); - /* set all of the bits above 'nbits' to 1 */ - clearints = ((nbits - 1) >> INT_TO_BIT) + 1; - clearbytes = clearints << INT_TO_BYTE; - MemSet((char *) freep, 0, clearbytes); - MemSet(((char *) freep) + clearbytes, 0xFF, - BMPGSZ_BYTE(metap) - clearbytes); - freep[clearints - 1] = ALL_SET << (nbits & INT_MASK); - - /* bit 0 represents the new bitmap page */ - SETBIT(freep, 0); - - /* metapage already has a write lock */ - metap->hashm_nmaps++; - metap->hashm_mapp[ndx] = blkno; - - /* write out the new bitmap page (releasing its locks) */ + /* write out the new bitmap page (releasing write lock) */ _hash_wrtbuf(rel, buf); - return 0; + /* add the new bitmap page to the metapage's list of bitmaps */ + /* metapage already has a write lock */ + if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of overflow pages in hash index \"%s\"", + RelationGetRelationName(rel)))); + + metap->hashm_mapp[metap->hashm_nmaps] = blkno; + + metap->hashm_nmaps++; } @@ -593,14 +567,8 @@ _hash_squeezebucket(Relation rel, rblkno = ropaque->hasho_prevblkno; Assert(BlockNumberIsValid(rblkno)); - /* - * free this overflow page. the extra _hash_relbuf is because - * _hash_freeovflpage gratuitously returns the next page (we - * want the previous page and will get it ourselves later). - */ - rbuf = _hash_freeovflpage(rel, rbuf); - if (BufferIsValid(rbuf)) - _hash_relbuf(rel, rbuf, HASH_WRITE); + /* free this overflow page */ + _hash_freeovflpage(rel, rbuf); if (rblkno == wblkno) { diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index b6ea8cf31a..e5e77c94b6 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -8,19 +8,22 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.38 2003/08/04 02:39:57 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.39 2003/09/01 20:26:34 tgl Exp $ * * NOTES * Postgres hash pages look like ordinary relation pages. The opaque * data at high addresses includes information about the page including - * whether a page is an overflow page or a true bucket, the block - * numbers of the preceding and following pages, and the overflow - * address of the page if it is an overflow page. + * whether a page is an overflow page or a true bucket, the bucket + * number, and the block numbers of the preceding and following pages + * in the same bucket. * * The first page in a hash relation, page zero, is special -- it stores * information describing the hash table; it is referred to as the * "meta page." Pages one and higher store the actual data. * + * There are also bitmap pages, which are not manipulated here; + * see hashovfl.c. + * *------------------------------------------------------------------------- */ @@ -32,10 +35,6 @@ #include "storage/lmgr.h" -static void _hash_setpagelock(Relation rel, BlockNumber blkno, int access); -static void _hash_unsetpagelock(Relation rel, BlockNumber blkno, int access); -static void _hash_splitpage(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket); - /* * We use high-concurrency locking on hash indices. There are two cases in * which we don't do locking. One is when we're building the index. @@ -62,11 +61,15 @@ static void _hash_splitpage(Relation rel, Buffer metabuf, Bucket obucket, Bucket * the page being deleted, other than an indexscan of our own backend, * which will be taken care of by _hash_adjscans. */ - - #define USELOCKING (!BuildingHash && !IsInitProcessingMode()) +static void _hash_setpagelock(Relation rel, BlockNumber blkno, int access); +static void _hash_unsetpagelock(Relation rel, BlockNumber blkno, int access); +static void _hash_splitbucket(Relation rel, Buffer metabuf, + Bucket obucket, Bucket nbucket); + + /* * _hash_metapinit() -- Initialize the metadata page of a hash index, * the two buckets that we begin with and the initial @@ -80,9 +83,6 @@ _hash_metapinit(Relation rel) Buffer metabuf; Buffer buf; Page pg; - int nbuckets; - uint32 nelem; /* number elements */ - uint32 lg2nelem; /* _hash_log2(nelem) */ uint16 i; /* can't be sharing this with anyone, now... */ @@ -95,63 +95,48 @@ _hash_metapinit(Relation rel) metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE); pg = BufferGetPage(metabuf); - metap = (HashMetaPage) pg; _hash_pageinit(pg, BufferGetPageSize(metabuf)); - metap->hashm_magic = HASH_MAGIC; - metap->hashm_version = HASH_VERSION; - metap->hashm_nkeys = 0; - metap->hashm_nmaps = 0; - metap->hashm_ffactor = DEFAULT_FFACTOR; - metap->hashm_bsize = BufferGetPageSize(metabuf); - metap->hashm_bshift = _hash_log2(metap->hashm_bsize); - for (i = metap->hashm_bshift; i > 0; --i) - { - if ((1 << i) < (metap->hashm_bsize - - (MAXALIGN(sizeof(PageHeaderData)) + - MAXALIGN(sizeof(HashPageOpaqueData))))) - break; - } - Assert(i); - metap->hashm_bmsize = 1 << i; - metap->hashm_procid = index_getprocid(rel, 1, HASHPROC); - - /* - * Make nelem = 2 rather than 0 so that we end up allocating space for - * the next greater power of two number of buckets. - */ - nelem = 2; - lg2nelem = 1; /* _hash_log2(MAX(nelem, 2)) */ - nbuckets = 2; /* 1 << lg2nelem */ - - MemSet((char *) metap->hashm_spares, 0, sizeof(metap->hashm_spares)); - MemSet((char *) metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); - - metap->hashm_spares[lg2nelem] = 2; /* lg2nelem + 1 */ - metap->hashm_spares[lg2nelem + 1] = 2; /* lg2nelem + 1 */ - metap->hashm_ovflpoint = 1; /* lg2nelem */ - metap->hashm_lastfreed = 2; - - metap->hashm_maxbucket = metap->hashm_lowmask = 1; /* nbuckets - 1 */ - metap->hashm_highmask = 3; /* (nbuckets << 1) - 1 */ - pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); - pageopaque->hasho_oaddr = InvalidOvflAddress; + pageopaque->hasho_oaddr = 0; pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_flag = LH_META_PAGE; pageopaque->hasho_bucket = -1; - /* - * First bitmap page is at: splitpoint lg2nelem page offset 1 which - * turns out to be page 3. Couldn't initialize page 3 until we - * created the first two buckets above. - */ - if (_hash_initbitmap(rel, metap, OADDR_OF(lg2nelem, 1), lg2nelem + 1, 0)) - elog(ERROR, "_hash_initbitmap failed"); + metap = (HashMetaPage) pg; - /* all done */ - _hash_wrtnorelbuf(metabuf); + metap->hashm_magic = HASH_MAGIC; + metap->hashm_version = HASH_VERSION; + metap->hashm_ntuples = 0; + metap->hashm_nmaps = 0; + metap->hashm_ffactor = DEFAULT_FFACTOR; + metap->hashm_bsize = BufferGetPageSize(metabuf); + metap->hashm_bshift = _hash_log2(metap->hashm_bsize); + /* page size must be power of 2 */ + Assert(metap->hashm_bsize == (1 << metap->hashm_bshift)); + /* bitmap size is half of page size, to keep it also power of 2 */ + metap->hashm_bmsize = (metap->hashm_bsize >> 1); + Assert(metap->hashm_bsize >= metap->hashm_bmsize + + MAXALIGN(sizeof(PageHeaderData)) + + MAXALIGN(sizeof(HashPageOpaqueData))); + Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1)); + + metap->hashm_procid = index_getprocid(rel, 1, HASHPROC); + + /* + * We initialize the index with two buckets, 0 and 1, occupying physical + * blocks 1 and 2. The first freespace bitmap page is in block 3. + */ + metap->hashm_maxbucket = metap->hashm_lowmask = 1; /* nbuckets - 1 */ + metap->hashm_highmask = 3; /* (nbuckets << 1) - 1 */ + + MemSet((char *) metap->hashm_spares, 0, sizeof(metap->hashm_spares)); + MemSet((char *) metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); + + metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */ + metap->hashm_ovflpoint = 1; + metap->hashm_firstfree = 0; /* * initialize the first two buckets @@ -162,7 +147,7 @@ _hash_metapinit(Relation rel) pg = BufferGetPage(buf); _hash_pageinit(pg, BufferGetPageSize(buf)); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); - pageopaque->hasho_oaddr = InvalidOvflAddress; + pageopaque->hasho_oaddr = 0; pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_flag = LH_BUCKET_PAGE; @@ -170,7 +155,14 @@ _hash_metapinit(Relation rel) _hash_wrtbuf(rel, buf); } - _hash_relbuf(rel, metabuf, HASH_WRITE); + /* + * Initialize bitmap page. Can't do this until we + * create the first two buckets, else smgr will complain. + */ + _hash_initbitmap(rel, metap, 3); + + /* all done */ + _hash_wrtbuf(rel, metabuf); if (USELOCKING) UnlockRelation(rel, AccessExclusiveLock); @@ -267,30 +259,28 @@ _hash_wrtnorelbuf(Buffer buf) WriteNoReleaseBuffer(buf); } -Page +/* + * _hash_chgbufaccess() -- Change from read to write access or vice versa. + * + * When changing from write to read, we assume the buffer is dirty and tell + * bufmgr it must be written out. + */ +void _hash_chgbufaccess(Relation rel, - Buffer *bufp, + Buffer buf, int from_access, int to_access) { BlockNumber blkno; - blkno = BufferGetBlockNumber(*bufp); + blkno = BufferGetBlockNumber(buf); - switch (from_access) - { - case HASH_WRITE: - _hash_wrtbuf(rel, *bufp); - break; - case HASH_READ: - _hash_relbuf(rel, *bufp, from_access); - break; - default: - elog(ERROR, "unrecognized hash access code: %d", from_access); - break; - } - *bufp = _hash_getbuf(rel, blkno, to_access); - return BufferGetPage(*bufp); + if (from_access == HASH_WRITE) + _hash_wrtnorelbuf(buf); + + _hash_unsetpagelock(rel, blkno, from_access); + + _hash_setpagelock(rel, blkno, to_access); } /* @@ -303,12 +293,14 @@ _hash_pageinit(Page page, Size size) PageInit(page, size, sizeof(HashPageOpaqueData)); } +/* + * _hash_setpagelock() -- Acquire the requested type of lock on a page. + */ static void _hash_setpagelock(Relation rel, BlockNumber blkno, int access) { - if (USELOCKING) { switch (access) @@ -326,12 +318,14 @@ _hash_setpagelock(Relation rel, } } +/* + * _hash_unsetpagelock() -- Release the specified type of lock on a page. + */ static void _hash_unsetpagelock(Relation rel, BlockNumber blkno, int access) { - if (USELOCKING) { switch (access) @@ -379,24 +373,22 @@ _hash_pagedel(Relation rel, ItemPointer tid) opaque = (HashPageOpaque) PageGetSpecialPointer(page); PageIndexTupleDelete(page, offno); - _hash_wrtnorelbuf(buf); if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE)) - { - buf = _hash_freeovflpage(rel, buf); - if (BufferIsValid(buf)) - _hash_relbuf(rel, buf, HASH_WRITE); - } + _hash_freeovflpage(rel, buf); else - _hash_relbuf(rel, buf, HASH_WRITE); + _hash_wrtbuf(rel, buf); metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE); metap = (HashMetaPage) BufferGetPage(metabuf); _hash_checkpage((Page) metap, LH_META_PAGE); - metap->hashm_nkeys--; + metap->hashm_ntuples--; _hash_wrtbuf(rel, metabuf); } +/* + * Expand the hash table by creating one new bucket. + */ void _hash_expandtable(Relation rel, Buffer metabuf) { @@ -408,53 +400,55 @@ _hash_expandtable(Relation rel, Buffer metabuf) metap = (HashMetaPage) BufferGetPage(metabuf); _hash_checkpage((Page) metap, LH_META_PAGE); - metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE); + _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_WRITE); + new_bucket = ++metap->hashm_maxbucket; - metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ); - old_bucket = (metap->hashm_maxbucket & metap->hashm_lowmask); + old_bucket = (new_bucket & metap->hashm_lowmask); + + if (new_bucket > metap->hashm_highmask) + { + /* Starting a new doubling */ + metap->hashm_lowmask = metap->hashm_highmask; + metap->hashm_highmask = new_bucket | metap->hashm_lowmask; + } /* - * If the split point is increasing (hashm_maxbucket's log base 2 * - * increases), we need to copy the current contents of the spare split - * bucket to the next bucket. + * If the split point is increasing (hashm_maxbucket's log base 2 + * increases), we need to adjust the hashm_spares[] array and + * hashm_ovflpoint so that future overflow pages will be created beyond + * this new batch of bucket pages. + * + * XXX should initialize new bucket pages to prevent out-of-order + * page creation. */ spare_ndx = _hash_log2(metap->hashm_maxbucket + 1); if (spare_ndx > metap->hashm_ovflpoint) { - - metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE); + Assert(spare_ndx == metap->hashm_ovflpoint + 1); metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint]; metap->hashm_ovflpoint = spare_ndx; - metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ); } - if (new_bucket > metap->hashm_highmask) - { + _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_READ); - /* Starting a new doubling */ - metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE); - metap->hashm_lowmask = metap->hashm_highmask; - metap->hashm_highmask = new_bucket | metap->hashm_lowmask; - metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ); - - } /* Relocate records to the new bucket */ - _hash_splitpage(rel, metabuf, old_bucket, new_bucket); + _hash_splitbucket(rel, metabuf, old_bucket, new_bucket); } /* - * _hash_splitpage -- split 'obucket' into 'obucket' and 'nbucket' + * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * - * this routine is actually misnamed -- we are splitting a bucket that - * consists of a base bucket page and zero or more overflow (bucket - * chain) pages. + * We are splitting a bucket that consists of a base bucket page and zero + * or more overflow (bucket chain) pages. We must relocate tuples that + * belong in the new bucket, and compress out any free space in the old + * bucket. */ static void -_hash_splitpage(Relation rel, - Buffer metabuf, - Bucket obucket, - Bucket nbucket) +_hash_splitbucket(Relation rel, + Buffer metabuf, + Bucket obucket, + Bucket nbucket) { Bucket bucket; Buffer obuf; @@ -475,7 +469,7 @@ _hash_splitpage(Relation rel, OffsetNumber omaxoffnum; Page opage; Page npage; - TupleDesc itupdesc; + TupleDesc itupdesc = RelationGetDescr(rel); metap = (HashMetaPage) BufferGetPage(metabuf); _hash_checkpage((Page) metap, LH_META_PAGE); @@ -488,13 +482,13 @@ _hash_splitpage(Relation rel, opage = BufferGetPage(obuf); npage = BufferGetPage(nbuf); - /* initialize the new bucket */ + /* initialize the new bucket page */ _hash_pageinit(npage, BufferGetPageSize(nbuf)); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nopaque->hasho_prevblkno = InvalidBlockNumber; nopaque->hasho_nextblkno = InvalidBlockNumber; nopaque->hasho_flag = LH_BUCKET_PAGE; - nopaque->hasho_oaddr = InvalidOvflAddress; + nopaque->hasho_oaddr = 0; nopaque->hasho_bucket = nbucket; _hash_wrtnorelbuf(nbuf); @@ -569,11 +563,11 @@ _hash_splitpage(Relation rel, else { /* - * we're at the end of the bucket chain, so now we're - * really done with everything. before quitting, call - * _hash_squeezebucket to ensure the tuples in the bucket - * (including the overflow pages) are packed as tightly as - * possible. + * We're at the end of the bucket chain, so now we're + * really done with everything. Before quitting, call + * _hash_squeezebucket to ensure the tuples remaining in the + * old bucket (including the overflow pages) are packed as + * tightly as possible. The new bucket is already tight. */ _hash_wrtbuf(rel, obuf); _hash_wrtbuf(rel, nbuf); @@ -585,8 +579,9 @@ _hash_splitpage(Relation rel, /* hash on the tuple */ hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum)); itup = &(hitem->hash_itup); - itupdesc = RelationGetDescr(rel); datum = index_getattr(itup, 1, itupdesc, &null); + Assert(!null); + bucket = _hash_call(rel, metap, datum); if (bucket == nbucket) @@ -603,7 +598,7 @@ _hash_splitpage(Relation rel, if (PageGetFreeSpace(npage) < itemsz) { - ovflbuf = _hash_addovflpage(rel, &metabuf, nbuf); + ovflbuf = _hash_addovflpage(rel, metabuf, nbuf); _hash_wrtbuf(rel, nbuf); nbuf = ovflbuf; npage = BufferGetPage(nbuf); @@ -638,10 +633,10 @@ _hash_splitpage(Relation rel, if (PageIsEmpty(opage) && (oopaque->hasho_flag & LH_OVERFLOW_PAGE)) { - obuf = _hash_freeovflpage(rel, obuf); + oblkno = _hash_freeovflpage(rel, obuf); /* check that we're not through the bucket chain */ - if (BufferIsInvalid(obuf)) + if (!BlockNumberIsValid(oblkno)) { _hash_wrtbuf(rel, nbuf); _hash_squeezebucket(rel, metap, obucket); @@ -652,9 +647,9 @@ _hash_splitpage(Relation rel, * re-init. again, we're guaranteed that an ovfl page has * at least one tuple. */ + obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); opage = BufferGetPage(obuf); _hash_checkpage(opage, LH_OVERFLOW_PAGE); - oblkno = BufferGetBlockNumber(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); if (PageIsEmpty(opage)) elog(ERROR, "empty hash overflow page %u", oblkno); @@ -668,10 +663,8 @@ _hash_splitpage(Relation rel, * the tuple stays on this page. we didn't move anything, so * we didn't delete anything and therefore we don't have to * change 'omaxoffnum'. - * - * XXX any hash value from [0, nbucket-1] will map to this - * bucket, which doesn't make sense to me. */ + Assert(bucket == obucket); ooffnum = OffsetNumberNext(ooffnum); } } diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 821f8348e8..83aae20c1c 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: hash.h,v 1.49 2003/08/04 02:40:10 momjian Exp $ + * $Id: hash.h,v 1.50 2003/09/01 20:26:34 tgl Exp $ * * NOTES * modeled after Margo Seltzer's hash implementation for unix. @@ -24,43 +24,18 @@ #include "fmgr.h" /* - * An overflow page is a spare page allocated for storing data whose - * bucket doesn't have room to store it. We use overflow pages rather - * than just splitting the bucket because there is a linear order in - * the way we split buckets. In other words, if there isn't enough space - * in the bucket itself, put it in an overflow page. - * - * Overflow page addresses are stored in form: (Splitnumber, Page offset). - * - * A splitnumber is the number of the generation where the table doubles - * in size. The ovflpage's offset within the splitnumber; offsets start - * at 1. - * - * We convert the stored bitmap address into a page address with the - * macro OADDR_OF(S, O) where S is the splitnumber and O is the page - * offset. + * Mapping from hash bucket number to physical block number of bucket's + * starting page. Beware of multiple evaluations of argument! Also notice + * macro's implicit dependency on "metap". */ typedef uint32 Bucket; -typedef bits16 OverflowPageAddress; -typedef uint32 SplitNumber; -typedef uint32 PageOffset; - -/* A valid overflow address will always have a page offset >= 1 */ -#define InvalidOvflAddress 0 - -#define SPLITSHIFT 11 -#define SPLITMASK 0x7FF -#define SPLITNUM(N) ((SplitNumber)(((uint32)(N)) >> SPLITSHIFT)) -#define OPAGENUM(N) ((PageOffset)((N) & SPLITMASK)) -#define OADDR_OF(S,O) ((OverflowPageAddress)((uint32)((uint32)(S) << SPLITSHIFT) + (O))) #define BUCKET_TO_BLKNO(B) \ - ((Bucket) ((B) + ((B) ? metap->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1) -#define OADDR_TO_BLKNO(B) \ - ((BlockNumber) \ - (BUCKET_TO_BLKNO ( (1 << SPLITNUM((B))) -1 ) + OPAGENUM((B)))); + ((BlockNumber) ((B) + ((B) ? metap->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1) /* + * Special space for hash index pages. + * * hasho_flag tells us which type of page we're looking at. For * example, knowing overflow pages from bucket pages is necessary * information when you're deleting tuples from a page. If all the @@ -69,7 +44,6 @@ typedef uint32 PageOffset; * the tuples are deleted from a bucket page, no additional action is * necessary. */ - #define LH_UNUSED_PAGE (0) #define LH_OVERFLOW_PAGE (1 << 0) #define LH_BUCKET_PAGE (1 << 1) @@ -78,9 +52,9 @@ typedef uint32 PageOffset; typedef struct HashPageOpaqueData { - bits16 hasho_flag; /* is this page a bucket or ovfl */ + bits16 hasho_flag; /* page type code, see above */ Bucket hasho_bucket; /* bucket number this pg belongs to */ - OverflowPageAddress hasho_oaddr; /* ovfl address of this ovfl pg */ + bits16 hasho_oaddr; /* no longer used; delete someday */ BlockNumber hasho_nextblkno; /* next ovfl blkno */ BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */ } HashPageOpaqueData; @@ -91,10 +65,8 @@ typedef HashPageOpaqueData *HashPageOpaque; * ScanOpaqueData is used to remember which buffers we're currently * examining in the scan. We keep these buffers locked and pinned and * recorded in the opaque entry of the scan in order to avoid doing a - * ReadBuffer() for every tuple in the index. This avoids semop() calls, - * which are expensive. + * ReadBuffer() for every tuple in the index. */ - typedef struct HashScanOpaqueData { Buffer hashso_curbuf; @@ -113,60 +85,55 @@ typedef HashScanOpaqueData *HashScanOpaque; #define HASH_VERSION 0 /* - * NCACHED is used to set the array sizeof spares[] & bitmaps[]. + * Spares[] holds the number of overflow pages currently allocated at or + * before a certain splitpoint. For example, if spares[3] = 7 then there are + * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro). The + * value in spares[ovflpoint] increases as overflow pages are added at the + * end of the index. Once ovflpoint increases (ie, we have actually allocated + * the bucket pages belonging to that splitpoint) the number of spares at the + * prior splitpoint cannot change anymore. * - * Spares[] is used to hold the number overflow pages currently - * allocated at a certain splitpoint. For example, if spares[3] = 7 - * then there are a maximum of 7 ovflpages available at splitpoint 3. - * The value in spares[] will change as ovflpages are added within - * a splitpoint. + * ovflpages that have been recycled for reuse can be found by looking at + * bitmaps that are stored within ovflpages dedicated for the purpose. + * The blknos of these bitmap pages are kept in bitmaps[]; nmaps is the + * number of currently existing bitmaps. * - * Within a splitpoint, one can find which ovflpages are available and - * which are used by looking at a bitmaps that are stored on the ovfl - * pages themselves. There is at least one bitmap for every splitpoint's - * ovflpages. Bitmaps[] contains the ovflpage addresses of the ovflpages - * that hold the ovflpage bitmaps. - * - * The reason that the size is restricted to NCACHED (32) is because - * the bitmaps are 16 bits: upper 5 represent the splitpoint, lower 11 - * indicate the page number within the splitpoint. Since there are - * only 5 bits to store the splitpoint, there can only be 32 splitpoints. - * Both spares[] and bitmaps[] use splitpoints as there indices, so there - * can only be 32 of them. + * The limitation on the size of spares[] comes from the fact that there's + * no point in having more than 2^32 buckets with only uint32 hashcodes. + * There is no particularly good reason for bitmaps[] to be the same size, + * but we're stuck with that until we want to force an initdb. (With 8K + * block size, 32 bitmaps limit us to 8 Gb of overflow space...) */ - -#define NCACHED 32 - +#define HASH_MAX_SPLITPOINTS 32 +#define HASH_MAX_BITMAPS 32 typedef struct HashMetaPageData { PageHeaderData hashm_phdr; /* pad for page header (do not use) */ uint32 hashm_magic; /* magic no. for hash tables */ uint32 hashm_version; /* version ID */ - uint32 hashm_nkeys; /* number of keys stored in the table */ - uint16 hashm_ffactor; /* fill factor */ - uint16 hashm_bsize; /* bucket size (bytes) - must be a power + uint32 hashm_ntuples; /* number of tuples stored in the table */ + uint16 hashm_ffactor; /* target fill factor (tuples/bucket) */ + uint16 hashm_bsize; /* index page size (bytes) - must be a power * of 2 */ - uint16 hashm_bshift; /* bucket shift */ - uint16 hashm_bmsize; /* bitmap array size (bytes) - must be a - * power of 2 */ + uint16 hashm_bshift; /* log2(bsize) */ + uint16 hashm_bmsize; /* bitmap array size (bytes) - must be + * exactly half of hashm_bsize */ uint32 hashm_maxbucket; /* ID of maximum bucket in use */ uint32 hashm_highmask; /* mask to modulo into entire table */ uint32 hashm_lowmask; /* mask to modulo into lower half of table */ - uint32 hashm_ovflpoint;/* pageno. from which ovflpgs being + uint32 hashm_ovflpoint;/* splitpoint from which ovflpgs being * allocated */ - uint32 hashm_lastfreed; /* last ovflpage freed */ - uint32 hashm_nmaps; /* Initial number of bitmaps */ - uint32 hashm_spares[NCACHED]; /* spare pages available at - * splitpoints */ - BlockNumber hashm_mapp[NCACHED]; /* blknumbers of ovfl page maps */ + uint32 hashm_firstfree; /* lowest-number free ovflpage (bit#) */ + uint32 hashm_nmaps; /* number of bitmap pages */ + uint32 hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before + * each splitpoint */ + BlockNumber hashm_mapp[HASH_MAX_BITMAPS]; /* blknos of ovfl bitmaps */ RegProcedure hashm_procid; /* hash procedure id from pg_proc */ } HashMetaPageData; typedef HashMetaPageData *HashMetaPage; -extern bool BuildingHash; - typedef struct HashItemData { IndexTupleData hash_itup; @@ -178,31 +145,33 @@ typedef HashItemData *HashItem; * Constants */ #define DEFAULT_FFACTOR 300 -#define SPLITMAX 8 #define BYTE_TO_BIT 3 /* 2^3 bits/byte */ -#define INT_TO_BYTE 2 /* 2^2 bytes/int */ -#define INT_TO_BIT 5 /* 2^5 bits/int */ #define ALL_SET ((uint32) ~0) /* - * bitmap pages do not contain tuples. they do contain the standard + * Bitmap pages do not contain tuples. They do contain the standard * page headers and trailers; however, everything in between is a - * giant bit array. the number of bits that fit on a page obviously - * depends on the page size and the header/trailer overhead. + * giant bit array. The number of bits that fit on a page obviously + * depends on the page size and the header/trailer overhead. In the + * present implementation, we use exactly half of a page for bitmap, + * so that we have a power-of-2 bits per page. + * + * The fact that the metapage has separate bsize and bmsize fields, + * but only one bshift field, is a design error that ought to be fixed. */ #define BMPGSZ_BYTE(metap) ((metap)->hashm_bmsize) #define BMPGSZ_BIT(metap) ((metap)->hashm_bmsize << BYTE_TO_BIT) +#define BMPG_SHIFT(metap) ((metap)->hashm_bshift - 1 + BYTE_TO_BIT) +#define BMPG_MASK(metap) (BMPGSZ_BIT(metap) - 1) #define HashPageGetBitmap(pg) \ ((uint32 *) (((char *) (pg)) + MAXALIGN(sizeof(PageHeaderData)))) /* - * The number of bits in an ovflpage bitmap which - * tells which ovflpages are empty versus in use (NOT the number of - * bits in an overflow page *address* bitmap). + * The number of bits in an ovflpage bitmap word. */ -#define BITS_PER_MAP 32 /* Number of bits in ovflpage bitmap */ +#define BITS_PER_MAP 32 /* Number of bits in uint32 */ -/* Given the address of the beginning of a big map, clear/set the nth bit */ +/* Given the address of the beginning of a bit map, clear/set the nth bit */ #define CLRBIT(A, N) ((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP))) #define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP))) #define ISSET(A, N) ((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP))) @@ -213,18 +182,9 @@ typedef HashItemData *HashItem; #define HASH_READ 0 #define HASH_WRITE 1 -/* - * In general, the hash code tries to localize its knowledge about page - * layout to a couple of routines. However, we need a special value to - * indicate "no page number" in those places where we expect page numbers. - */ - -#define P_NONE 0 - /* * Strategy number. There's only one valid strategy for hashing: equality. */ - #define HTEqualStrategyNumber 1 #define HTMaxStrategyNumber 1 @@ -233,9 +193,11 @@ typedef HashItemData *HashItem; * us with an amproc procudure for hashing a key of the new type. * Since we only have one such proc in amproc, it's number 1. */ - #define HASHPROC 1 + +extern bool BuildingHash; + /* public routines */ extern Datum hashbuild(PG_FUNCTION_ARGS); @@ -276,36 +238,32 @@ extern Datum hash_any(register const unsigned char *k, register int keylen); /* hashinsert.c */ extern InsertIndexResult _hash_doinsert(Relation rel, HashItem hitem); - /* hashovfl.c */ -extern Buffer _hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf); -extern Buffer _hash_freeovflpage(Relation rel, Buffer ovflbuf); -extern int32 _hash_initbitmap(Relation rel, HashMetaPage metap, int32 pnum, - int32 nbits, int32 ndx); +extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf); +extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf); +extern void _hash_initbitmap(Relation rel, HashMetaPage metap, + BlockNumber blkno); extern void _hash_squeezebucket(Relation rel, HashMetaPage metap, Bucket bucket); - /* hashpage.c */ extern void _hash_metapinit(Relation rel); extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access); extern void _hash_relbuf(Relation rel, Buffer buf, int access); extern void _hash_wrtbuf(Relation rel, Buffer buf); extern void _hash_wrtnorelbuf(Buffer buf); -extern Page _hash_chgbufaccess(Relation rel, Buffer *bufp, int from_access, +extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access, int to_access); extern void _hash_pageinit(Page page, Size size); extern void _hash_pagedel(Relation rel, ItemPointer tid); extern void _hash_expandtable(Relation rel, Buffer metabuf); - /* hashscan.c */ extern void _hash_regscan(IndexScanDesc scan); extern void _hash_dropscan(IndexScanDesc scan); extern void _hash_adjscans(Relation rel, ItemPointer tid); extern void AtEOXact_hash(void); - /* hashsearch.c */ extern void _hash_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP, HashMetaPage metap); @@ -314,7 +272,6 @@ extern bool _hash_first(IndexScanDesc scan, ScanDirection dir); extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf); - /* hashutil.c */ extern ScanKey _hash_mkscankey(Relation rel, IndexTuple itup); extern void _hash_freeskey(ScanKey skey); @@ -324,7 +281,6 @@ extern Bucket _hash_call(Relation rel, HashMetaPage metap, Datum key); extern uint32 _hash_log2(uint32 num); extern void _hash_checkpage(Page page, int flags); - /* hash.c */ extern void hash_redo(XLogRecPtr lsn, XLogRecord *record); extern void hash_undo(XLogRecPtr lsn, XLogRecord *record);