diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index c3db87a187..2522cca46c 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.62 2000/03/17 02:36:05 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.63 2000/04/09 04:43:16 tgl Exp $ * * NOTES * Transaction aborts can now occur two ways: @@ -642,7 +642,7 @@ RecordTransactionCommit() { FlushBufferPool(); if (leak) - ResetBufferPool(); + ResetBufferPool(true); /* * have the transaction access methods record the status @@ -658,7 +658,7 @@ RecordTransactionCommit() } if (leak) - ResetBufferPool(); + ResetBufferPool(true); } @@ -759,7 +759,10 @@ RecordTransactionAbort() if (SharedBufferChanged && !TransactionIdDidCommit(xid)) TransactionIdAbort(xid); - ResetBufferPool(); + /* + * Tell bufmgr and smgr to release resources. + */ + ResetBufferPool(false); /* false -> is abort */ } /* -------------------------------- diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index 1072877713..5425387b3f 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/catalog/catalog.c,v 1.30 2000/01/26 05:56:10 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/catalog/catalog.c,v 1.31 2000/04/09 04:43:15 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -23,25 +23,87 @@ #include "utils/syscache.h" /* - * relpath - path to the relation - * Perhaps this should be in-line code in relopen(). + * relpath - construct path to a relation's file + * + * Note that this only works with relations that are visible to the current + * backend, ie, either in the current database or shared system relations. + * + * Result is a palloc'd string. */ char * relpath(const char *relname) { char *path; - size_t bufsize = 0; if (IsSharedSystemRelationName(relname)) { - bufsize = strlen(DataDir) + sizeof(NameData) + 2; + /* Shared system relations live in DataDir */ + size_t bufsize = strlen(DataDir) + sizeof(NameData) + 2; + path = (char *) palloc(bufsize); - snprintf(path, bufsize, "%s/%s", DataDir, relname); + snprintf(path, bufsize, "%s%c%s", DataDir, SEP_CHAR, relname); return path; } + /* + * If it is in the current database, assume it is in current working + * directory. NB: this does not work during bootstrap! + */ return pstrdup(relname); } +/* + * relpath_blind - construct path to a relation's file + * + * Construct the path using only the info available to smgrblindwrt, + * namely the names and OIDs of the database and relation. (Shared system + * relations are identified with dbid = 0.) Note that we may have to + * access a relation belonging to a different database! + * + * Result is a palloc'd string. + */ + +char * +relpath_blind(const char *dbname, const char *relname, + Oid dbid, Oid relid) +{ + char *path; + + if (dbid == (Oid) 0) + { + /* Shared system relations live in DataDir */ + path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2); + sprintf(path, "%s%c%s", DataDir, SEP_CHAR, relname); + } + else if (dbid == MyDatabaseId) + { + /* XXX why is this inconsistent with relpath() ? */ + path = (char *) palloc(strlen(DatabasePath) + sizeof(NameData) + 2); + sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relname); + } + else + { + /* this is work around only !!! */ + char dbpathtmp[MAXPGPATH]; + Oid id; + char *dbpath; + + GetRawDatabaseInfo(dbname, &id, dbpathtmp); + + if (id != dbid) + elog(FATAL, "relpath_blind: oid of db %s is not %u", + dbname, dbid); + dbpath = ExpandDatabasePath(dbpathtmp); + if (dbpath == NULL) + elog(FATAL, "relpath_blind: can't expand path for db %s", + dbname); + path = (char *) palloc(strlen(dbpath) + sizeof(NameData) + 2); + sprintf(path, "%s%c%s", dbpath, SEP_CHAR, relname); + pfree(dbpath); + } + return path; +} + + /* * IsSystemRelationName * True iff name is the name of a system catalog relation. diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 0a58033c97..4f1d033bc4 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.32 2000/01/26 05:56:50 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.33 2000/04/09 04:43:18 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -65,9 +65,11 @@ long *NWaitIOBackendP; extern IpcSemaphoreId WaitIOSemId; long *PrivateRefCount; /* also used in freelist.c */ -bits8 *BufferLocks; /* */ -long *CommitInfoNeedsSave;/* to write buffers where we have filled - * in t_infomask */ +bits8 *BufferLocks; /* flag bits showing locks I have set */ +BufferTag *BufferTagLastDirtied; /* tag buffer had when last dirtied by me */ +BufferBlindId *BufferBlindLastDirtied; /* and its BlindId too */ +bool *BufferDirtiedByMe; /* T if buf has been dirtied in cur xact */ + /* * Data Structures: @@ -247,7 +249,9 @@ InitBufferPool(IPCKey key) #endif PrivateRefCount = (long *) calloc(NBuffers, sizeof(long)); BufferLocks = (bits8 *) calloc(NBuffers, sizeof(bits8)); - CommitInfoNeedsSave = (long *) calloc(NBuffers, sizeof(long)); + BufferTagLastDirtied = (BufferTag *) calloc(NBuffers, sizeof(BufferTag)); + BufferBlindLastDirtied = (BufferBlindId *) calloc(NBuffers, sizeof(BufferBlindId)); + BufferDirtiedByMe = (bool *) calloc(NBuffers, sizeof(bool)); } /* ----------------------------------------------------- diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index acc719ca4b..0887f3d1ec 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.77 2000/03/31 02:43:31 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.78 2000/04/09 04:43:18 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -94,8 +94,10 @@ static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum, bool bufferLockHeld); static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr, bool bufferLockHeld); +static void SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr); +static void ClearBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr); static void BufferSync(void); -static int BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld); +static int BufferReplace(BufferDesc *bufHdr); void PrintBufferDescs(void); /* --------------------------------------------------- @@ -176,7 +178,7 @@ is_userbuffer(Buffer buffer) { BufferDesc *buf = &BufferDescriptors[buffer - 1]; - if (IsSystemRelationName(buf->sb_relname)) + if (IsSystemRelationName(buf->blind.relname)) return false; return true; } @@ -199,7 +201,7 @@ ReadBuffer_Debug(char *file, fprintf(stderr, "PIN(RD) %ld relname = %s, blockNum = %d, \ refcount = %ld, file: %s, line: %d\n", - buffer, buf->sb_relname, buf->tag.blockNum, + buffer, buf->blind.relname, buf->tag.blockNum, PrivateRefCount[buffer - 1], file, line); } return buffer; @@ -390,22 +392,21 @@ BufferAlloc(Relation reln, * If there's no IO for the buffer and the buffer * is BROKEN,it should be read again. So start a * new buffer IO here. - - * - * wierd race condition: - * - * We were waiting for someone else to read the buffer. While - * we were waiting, the reader boof'd in some way, so the - * contents of the buffer are still invalid. By saying - * that we didn't find it, we can make the caller - * reinitialize the buffer. If two processes are waiting - * for this block, both will read the block. The second - * one to finish may overwrite any updates made by the - * first. (Assume higher level synchronization prevents - * this from happening). - * - * This is never going to happen, don't worry about it. - */ + * + * wierd race condition: + * + * We were waiting for someone else to read the buffer. While + * we were waiting, the reader boof'd in some way, so the + * contents of the buffer are still invalid. By saying + * that we didn't find it, we can make the caller + * reinitialize the buffer. If two processes are waiting + * for this block, both will read the block. The second + * one to finish may overwrite any updates made by the + * first. (Assume higher level synchronization prevents + * this from happening). + * + * This is never going to happen, don't worry about it. + */ *foundPtr = FALSE; } #ifdef BMTRACE @@ -465,33 +466,24 @@ BufferAlloc(Relation reln, * in WaitIO until we're done. */ inProgress = TRUE; -#ifdef HAS_TEST_AND_SET /* * All code paths that acquire this lock pin the buffer first; * since no one had it pinned (it just came off the free * list), no one else can have this lock. */ -#endif /* HAS_TEST_AND_SET */ StartBufferIO(buf, false); /* * Write the buffer out, being careful to release BufMgrLock * before starting the I/O. - * - * This #ifndef is here because a few extra semops REALLY kill - * you on machines that don't have spinlocks. If you don't - * operate with much concurrency, well... */ - smok = BufferReplace(buf, true); -#ifndef OPTIMIZE_SINGLE - SpinAcquire(BufMgrLock); -#endif /* OPTIMIZE_SINGLE */ + smok = BufferReplace(buf); if (smok == FALSE) { elog(NOTICE, "BufferAlloc: cannot write block %u for %s/%s", - buf->tag.blockNum, buf->sb_dbname, buf->sb_relname); + buf->tag.blockNum, buf->blind.dbname, buf->blind.relname); inProgress = FALSE; buf->flags |= BM_IO_ERROR; buf->flags &= ~BM_IO_IN_PROGRESS; @@ -516,7 +508,7 @@ BufferAlloc(Relation reln, if (buf->flags & BM_JUST_DIRTIED) { elog(FATAL, "BufferAlloc: content of block %u (%s) changed while flushing", - buf->tag.blockNum, buf->sb_relname); + buf->tag.blockNum, buf->blind.relname); } else buf->flags &= ~BM_DIRTY; @@ -562,6 +554,7 @@ BufferAlloc(Relation reln, */ if (buf != NULL) { + buf->flags &= ~BM_IO_IN_PROGRESS; TerminateBufferIO(buf); /* give up the buffer since we don't need it any more */ PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; @@ -572,7 +565,6 @@ BufferAlloc(Relation reln, AddBufferToFreelist(buf); buf->flags |= BM_FREE; } - buf->flags &= ~BM_IO_IN_PROGRESS; } PinBuffer(buf2); @@ -619,8 +611,8 @@ BufferAlloc(Relation reln, } /* record the database name and relation name for this buffer */ - strcpy(buf->sb_relname, RelationGetPhysicalRelationName(reln)); - strcpy(buf->sb_dbname, DatabaseName); + strcpy(buf->blind.dbname, DatabaseName); + strcpy(buf->blind.relname, RelationGetPhysicalRelationName(reln)); INIT_BUFFERTAG(&(buf->tag), reln, blockNum); if (!BufTableInsert(buf)) @@ -683,9 +675,9 @@ WriteBuffer(Buffer buffer) SpinAcquire(BufMgrLock); Assert(bufHdr->refcount > 0); bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); + SetBufferDirtiedByMe(buffer, bufHdr); UnpinBuffer(bufHdr); SpinRelease(BufMgrLock); - CommitInfoNeedsSave[buffer - 1] = 0; return TRUE; } @@ -702,7 +694,7 @@ WriteBuffer_Debug(char *file, int line, Buffer buffer) buf = &BufferDescriptors[buffer - 1]; fprintf(stderr, "UNPIN(WR) %ld relname = %s, blockNum = %d, \ refcount = %ld, file: %s, line: %d\n", - buffer, buf->sb_relname, buf->tag.blockNum, + buffer, buf->blind.relname, buf->tag.blockNum, PrivateRefCount[buffer - 1], file, line); } } @@ -767,8 +759,9 @@ DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno, char *dest) * * 'buffer' is known to be dirty/pinned, so there should not be a * problem reading the BufferDesc members without the BufMgrLock - * (nobody should be able to change tags, flags, etc. out from under - * us). Unpin if 'release' is TRUE. + * (nobody should be able to change tags out from under us). + * + * Unpin if 'release' is TRUE. */ int FlushBuffer(Buffer buffer, bool release) @@ -784,6 +777,8 @@ FlushBuffer(Buffer buffer, bool release) if (BAD_BUFFER_ID(buffer)) return STATUS_ERROR; + Assert(PrivateRefCount[buffer - 1] > 0); /* else caller didn't pin */ + bufHdr = &BufferDescriptors[buffer - 1]; bufdb = bufHdr->tag.relId.dbId; @@ -809,7 +804,7 @@ FlushBuffer(Buffer buffer, bool release) if (status == SM_FAIL) { elog(ERROR, "FlushBuffer: cannot flush block %u of the relation %s", - bufHdr->tag.blockNum, bufHdr->sb_relname); + bufHdr->tag.blockNum, bufHdr->blind.relname); return STATUS_ERROR; } BufferFlushCount++; @@ -820,19 +815,21 @@ FlushBuffer(Buffer buffer, bool release) /* * If this buffer was marked by someone as DIRTY while we were - * flushing it out we must not clear DIRTY flag - vadim 01/17/97 + * flushing it out we must not clear shared DIRTY flag - vadim 01/17/97 + * + * ... but we can clear BufferDirtiedByMe anyway - tgl 3/31/00 */ if (bufHdr->flags & BM_JUST_DIRTIED) { elog(NOTICE, "FlushBuffer: content of block %u (%s) changed while flushing", - bufHdr->tag.blockNum, bufHdr->sb_relname); + bufHdr->tag.blockNum, bufHdr->blind.relname); } else bufHdr->flags &= ~BM_DIRTY; + ClearBufferDirtiedByMe(buffer, bufHdr); if (release) UnpinBuffer(bufHdr); SpinRelease(BufMgrLock); - CommitInfoNeedsSave[buffer - 1] = 0; return STATUS_OK; } @@ -857,9 +854,10 @@ WriteNoReleaseBuffer(Buffer buffer) SharedBufferChanged = true; SpinAcquire(BufMgrLock); + Assert(bufHdr->refcount > 0); bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); + SetBufferDirtiedByMe(buffer, bufHdr); SpinRelease(BufMgrLock); - CommitInfoNeedsSave[buffer - 1] = 0; return STATUS_OK; } @@ -901,11 +899,6 @@ ReleaseAndReadBuffer(Buffer buffer, AddBufferToFreelist(bufHdr); bufHdr->flags |= BM_FREE; } - if (CommitInfoNeedsSave[buffer - 1]) - { - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - CommitInfoNeedsSave[buffer - 1] = 0; - } retbuf = ReadBufferWithBufferLock(relation, blockNum, true); return retbuf; } @@ -915,13 +908,120 @@ ReleaseAndReadBuffer(Buffer buffer, return ReadBuffer(relation, blockNum); } +/* + * SetBufferDirtiedByMe -- mark a shared buffer as being dirtied by this xact + * + * This flag essentially remembers that we need to write and fsync this buffer + * before we can commit the transaction. The write might end up getting done + * by another backend, but we must do the fsync ourselves (else we could + * commit before the data actually reaches disk). We do not issue fsync + * instantly upon write; the storage manager keeps track of which files need + * to be fsync'd before commit can occur. A key aspect of this data structure + * is that we will be able to notify the storage manager that an fsync is + * needed even after another backend has done the physical write and replaced + * the buffer contents with something else! + * + * NB: we must be holding the bufmgr lock at entry, and the buffer must be + * pinned so that no other backend can take it away from us. + */ +static void +SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr) +{ + BufferTag *tagLastDirtied = & BufferTagLastDirtied[buffer - 1]; + Relation reln; + int status; + + /* + * If the flag is already set, check to see whether the buffertag is + * the same. If not, some other backend already wrote the buffer data + * that we dirtied. We must tell the storage manager to make an fsync + * pending on that file before we can overwrite the old tag value. + */ + if (BufferDirtiedByMe[buffer - 1]) + { + if (bufHdr->tag.relId.dbId == tagLastDirtied->relId.dbId && + bufHdr->tag.relId.relId == tagLastDirtied->relId.relId && + bufHdr->tag.blockNum == tagLastDirtied->blockNum) + return; /* Same tag already dirtied, so no work */ + +#ifndef OPTIMIZE_SINGLE + SpinRelease(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + + reln = RelationIdCacheGetRelation(tagLastDirtied->relId.relId); + + if (reln == (Relation) NULL) + { + status = smgrblindmarkdirty(DEFAULT_SMGR, + BufferBlindLastDirtied[buffer - 1].dbname, + BufferBlindLastDirtied[buffer - 1].relname, + tagLastDirtied->relId.dbId, + tagLastDirtied->relId.relId, + tagLastDirtied->blockNum); + } + else + { + status = smgrmarkdirty(DEFAULT_SMGR, reln, + tagLastDirtied->blockNum); + /* drop relcache refcnt incremented by RelationIdCacheGetRelation */ + RelationDecrementReferenceCount(reln); + } + if (status == SM_FAIL) + { + elog(ERROR, "SetBufferDirtiedByMe: cannot mark %u for %s", + tagLastDirtied->blockNum, + BufferBlindLastDirtied[buffer - 1].relname); + } + +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + + } + + *tagLastDirtied = bufHdr->tag; + BufferBlindLastDirtied[buffer - 1] = bufHdr->blind; + BufferDirtiedByMe[buffer - 1] = true; +} + +/* + * ClearBufferDirtiedByMe -- mark a shared buffer as no longer needing fsync + * + * If we write out a buffer ourselves, then the storage manager will set its + * needs-fsync flag for that file automatically, and so we can clear our own + * flag that says it needs to be done later. + * + * NB: we must be holding the bufmgr lock at entry. + */ +static void +ClearBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr) +{ + BufferTag *tagLastDirtied = & BufferTagLastDirtied[buffer - 1]; + + /* + * Do *not* clear the flag if it refers to some other buffertag than + * the data we just wrote. This is unlikely, but possible if some + * other backend replaced the buffer contents since we set our flag. + */ + if (bufHdr->tag.relId.dbId == tagLastDirtied->relId.dbId && + bufHdr->tag.relId.relId == tagLastDirtied->relId.relId && + bufHdr->tag.blockNum == tagLastDirtied->blockNum) + { + BufferDirtiedByMe[buffer - 1] = false; + } +} + /* * BufferSync -- Flush all dirty buffers in the pool. * - * This is called at transaction commit time. It does the wrong thing, - * right now. We should flush only our own changes to stable storage, - * and we should obey the lock protocol on the buffer manager metadata - * as we do it. Also, we need to be sure that no other transaction is + * This is called at transaction commit time. We find all buffers + * that have been dirtied by the current xact and flush them to disk. + * We do *not* flush dirty buffers that have been dirtied by other xacts. + * (This is a substantial change from pre-7.0 behavior.) + * + * OLD COMMENTS (do these still apply?) + * + * Also, we need to be sure that no other transaction is * modifying the page as we flush it. This is only a problem for objects * that use a non-two-phase locking protocol, like btree indices. For * those objects, we would like to set a write lock for the duration of @@ -936,21 +1036,49 @@ static void BufferSync() { int i; - Oid bufdb; - Oid bufrel; - Relation reln; BufferDesc *bufHdr; int status; + Relation reln; + bool didwrite; - SpinAcquire(BufMgrLock); for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) { + /* Ignore buffers that were not dirtied by me */ + if (! BufferDirtiedByMe[i]) + continue; + + SpinAcquire(BufMgrLock); + + /* + * We only need to write if the buffer is still dirty and still + * contains the same disk page that it contained when we dirtied it. + * Otherwise, someone else has already written our changes for us, + * and we need only fsync. + * + * (NOTE: it's still possible to do an unnecessary write, if other + * xacts have written and then re-dirtied the page since our last + * change to it. But that should be pretty uncommon, and there's + * no easy way to detect it anyway.) + */ + reln = NULL; + didwrite = false; if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY)) { + Oid bufdb; + Oid bufrel; + bufdb = bufHdr->tag.relId.dbId; bufrel = bufHdr->tag.relId.relId; - if (bufdb == MyDatabaseId || bufdb == (Oid) 0) + if (bufdb == BufferTagLastDirtied[i].relId.dbId && + bufrel == BufferTagLastDirtied[i].relId.relId && + bufHdr->tag.blockNum == BufferTagLastDirtied[i].blockNum) { + /* + * Try to find relation for buf. This could fail, if the + * rel has been flushed from the relcache since we dirtied + * the page. That should be uncommon, so paying the extra + * cost of a blind write when it happens seems OK. + */ reln = RelationIdCacheGetRelation(bufrel); /* @@ -970,74 +1098,114 @@ BufferSync() if (bufHdr->flags & BM_IO_ERROR) { elog(ERROR, "BufferSync: write error %u for %s", - bufHdr->tag.blockNum, bufHdr->sb_relname); + bufHdr->tag.blockNum, bufHdr->blind.relname); } - /* drop refcnt from RelationIdCacheGetRelation */ - if (reln != (Relation) NULL) - RelationDecrementReferenceCount(reln); - continue; - } - - /* - * To check if block content changed while flushing (see - * below). - vadim 01/17/97 - */ - WaitIO(bufHdr, BufMgrLock); /* confirm end of IO */ - bufHdr->flags &= ~BM_JUST_DIRTIED; - StartBufferIO(bufHdr, false); /* output IO start */ - - /* - * If we didn't have the reldesc in our local cache, flush - * this page out using the 'blind write' storage manager - * routine. If we did find it, use the standard - * interface. - */ - -#ifndef OPTIMIZE_SINGLE - SpinRelease(BufMgrLock); -#endif /* OPTIMIZE_SINGLE */ - if (reln == (Relation) NULL) - { - status = smgrblindwrt(DEFAULT_SMGR, bufHdr->sb_dbname, - bufHdr->sb_relname, bufdb, bufrel, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); } else { - status = smgrwrite(DEFAULT_SMGR, reln, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); - } + /* + * To check if block content changed while flushing (see + * below). - vadim 01/17/97 + */ + WaitIO(bufHdr, BufMgrLock); /* confirm end of IO */ + bufHdr->flags &= ~BM_JUST_DIRTIED; + StartBufferIO(bufHdr, false); /* output IO start */ + + /* + * If we didn't have the reldesc in our local cache, write + * this page out using the 'blind write' storage manager + * routine. If we did find it, use the standard + * interface. + */ #ifndef OPTIMIZE_SINGLE - SpinAcquire(BufMgrLock); + SpinRelease(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + if (reln == (Relation) NULL) + { + status = smgrblindwrt(DEFAULT_SMGR, + bufHdr->blind.dbname, + bufHdr->blind.relname, + bufdb, bufrel, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + else + { + status = smgrwrite(DEFAULT_SMGR, reln, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); #endif /* OPTIMIZE_SINGLE */ - UnpinBuffer(bufHdr); - if (status == SM_FAIL) - { - bufHdr->flags |= BM_IO_ERROR; - elog(ERROR, "BufferSync: cannot write %u for %s", - bufHdr->tag.blockNum, bufHdr->sb_relname); - } - bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */ - TerminateBufferIO(bufHdr); /* Sync IO finished */ - BufferFlushCount++; + UnpinBuffer(bufHdr); + if (status == SM_FAIL) + { + bufHdr->flags |= BM_IO_ERROR; + elog(ERROR, "BufferSync: cannot write %u for %s", + bufHdr->tag.blockNum, bufHdr->blind.relname); + } + bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */ + TerminateBufferIO(bufHdr); /* Sync IO finished */ + BufferFlushCount++; + didwrite = true; - /* - * If this buffer was marked by someone as DIRTY while we - * were flushing it out we must not clear DIRTY flag - - * vadim 01/17/97 - */ - if (!(bufHdr->flags & BM_JUST_DIRTIED)) - bufHdr->flags &= ~BM_DIRTY; - /* drop refcnt from RelationIdCacheGetRelation */ + /* + * If this buffer was marked by someone as DIRTY while we + * were flushing it out we must not clear DIRTY flag - + * vadim 01/17/97 + * + * but it is OK to clear BufferDirtiedByMe - tgl 3/31/00 + */ + if (!(bufHdr->flags & BM_JUST_DIRTIED)) + bufHdr->flags &= ~BM_DIRTY; + } + + /* drop refcnt obtained by RelationIdCacheGetRelation */ if (reln != (Relation) NULL) RelationDecrementReferenceCount(reln); } } + + /* + * If we did not write the buffer (because someone else did), + * we must still fsync the file containing it, to ensure that the + * write is down to disk before we commit. + */ + if (! didwrite) + { +#ifndef OPTIMIZE_SINGLE + SpinRelease(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + + reln = RelationIdCacheGetRelation(BufferTagLastDirtied[i].relId.relId); + if (reln == (Relation) NULL) + { + status = smgrblindmarkdirty(DEFAULT_SMGR, + BufferBlindLastDirtied[i].dbname, + BufferBlindLastDirtied[i].relname, + BufferTagLastDirtied[i].relId.dbId, + BufferTagLastDirtied[i].relId.relId, + BufferTagLastDirtied[i].blockNum); + } + else + { + status = smgrmarkdirty(DEFAULT_SMGR, reln, + BufferTagLastDirtied[i].blockNum); + /* drop relcache refcnt incremented by RelationIdCacheGetRelation */ + RelationDecrementReferenceCount(reln); + + } +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + } + + BufferDirtiedByMe[i] = false; + + SpinRelease(BufMgrLock); } - SpinRelease(BufMgrLock); LocalBufferSync(); } @@ -1166,13 +1334,19 @@ ResetBufferUsage() /* ---------------------------------------------- * ResetBufferPool * - * this routine is supposed to be called when a transaction aborts. + * This routine is supposed to be called when a transaction aborts. * it will release all the buffer pins held by the transaction. + * Currently, we also call it during commit if BufferPoolCheckLeak + * detected a problem --- in that case, isCommit is TRUE, and we + * only clean up buffer pin counts. + * + * During abort, we also forget any pending fsync requests. Dirtied buffers + * will still get written, eventually, but there will be no fsync for them. * * ---------------------------------------------- */ void -ResetBufferPool() +ResetBufferPool(bool isCommit) { int i; @@ -1193,10 +1367,15 @@ ResetBufferPool() SpinRelease(BufMgrLock); } PrivateRefCount[i] = 0; - CommitInfoNeedsSave[i] = 0; + + if (! isCommit) + BufferDirtiedByMe[i] = false; } ResetLocalBufferPool(); + + if (! isCommit) + smgrabort(); } /* ----------------------------------------------- @@ -1222,7 +1401,7 @@ BufferPoolCheckLeak() "Buffer Leak: [%03d] (freeNext=%ld, freePrev=%ld, \ relname=%s, blockNum=%d, flags=0x%x, refcount=%d %ld)", i - 1, buf->freeNext, buf->freePrev, - buf->sb_relname, buf->tag.blockNum, buf->flags, + buf->blind.relname, buf->tag.blockNum, buf->flags, buf->refcount, PrivateRefCount[i - 1]); result = 1; } @@ -1306,25 +1485,25 @@ BufferGetRelation(Buffer buffer) /* * BufferReplace * - * Flush the buffer corresponding to 'bufHdr' + * Write out the buffer corresponding to 'bufHdr' * + * This routine used to flush the data to disk (ie, force immediate fsync) + * but that's no longer necessary because BufferSync is smarter than before. + * + * BufMgrLock must be held at entry, and the buffer must be pinned. */ static int -BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld) +BufferReplace(BufferDesc *bufHdr) { Relation reln; Oid bufdb, bufrel; int status; - if (!bufferLockHeld) - SpinAcquire(BufMgrLock); - /* * first try to find the reldesc in the cache, if no luck, don't * bother to build the reldesc from scratch, just do a blind write. */ - bufdb = bufHdr->tag.relId.dbId; bufrel = bufHdr->tag.relId.relId; @@ -1336,22 +1515,27 @@ BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld) /* To check if block content changed while flushing. - vadim 01/17/97 */ bufHdr->flags &= ~BM_JUST_DIRTIED; +#ifndef OPTIMIZE_SINGLE SpinRelease(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ if (reln != (Relation) NULL) { - status = smgrflush(DEFAULT_SMGR, reln, bufHdr->tag.blockNum, + status = smgrwrite(DEFAULT_SMGR, reln, bufHdr->tag.blockNum, (char *) MAKE_PTR(bufHdr->data)); } else { - /* blind write always flushes */ - status = smgrblindwrt(DEFAULT_SMGR, bufHdr->sb_dbname, - bufHdr->sb_relname, bufdb, bufrel, + status = smgrblindwrt(DEFAULT_SMGR, bufHdr->blind.dbname, + bufHdr->blind.relname, bufdb, bufrel, bufHdr->tag.blockNum, (char *) MAKE_PTR(bufHdr->data)); } +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + /* drop relcache refcnt incremented by RelationIdCacheGetRelation */ if (reln != (Relation) NULL) RelationDecrementReferenceCount(reln); @@ -1359,6 +1543,11 @@ BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld) if (status == SM_FAIL) return FALSE; + /* If we had marked this buffer as needing to be fsync'd, we can forget + * about that, because it's now the storage manager's responsibility. + */ + ClearBufferDirtiedByMe(BufferDescriptorGetBuffer(bufHdr), bufHdr); + BufferFlushCount++; return TRUE; @@ -1440,7 +1629,7 @@ ReleaseRelationBuffers(Relation rel) } /* Now we can do what we came for */ buf->flags &= ~ ( BM_DIRTY | BM_JUST_DIRTIED); - CommitInfoNeedsSave[i - 1] = 0; + ClearBufferDirtiedByMe(i, buf); /* * Release any refcount we may have. * @@ -1502,6 +1691,7 @@ DropBuffers(Oid dbid) } /* Now we can do what we came for */ buf->flags &= ~ ( BM_DIRTY | BM_JUST_DIRTIED); + ClearBufferDirtiedByMe(i, buf); /* * The thing should be free, if caller has checked that * no backends are running in that database. @@ -1533,7 +1723,7 @@ PrintBufferDescs() elog(DEBUG, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \ blockNum=%d, flags=0x%x, refcount=%d %ld)", i, buf->freeNext, buf->freePrev, - buf->sb_relname, buf->tag.blockNum, buf->flags, + buf->blind.relname, buf->tag.blockNum, buf->flags, buf->refcount, PrivateRefCount[i]); } SpinRelease(BufMgrLock); @@ -1544,7 +1734,7 @@ blockNum=%d, flags=0x%x, refcount=%d %ld)", for (i = 0; i < NBuffers; ++i, ++buf) { printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld)\n", - i, buf->sb_relname, buf->tag.blockNum, + i, buf->blind.relname, buf->tag.blockNum, buf->flags, buf->refcount, PrivateRefCount[i]); } } @@ -1562,7 +1752,7 @@ PrintPinnedBufs() if (PrivateRefCount[i] > 0) elog(NOTICE, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \ blockNum=%d, flags=0x%x, refcount=%d %ld)\n", - i, buf->freeNext, buf->freePrev, buf->sb_relname, + i, buf->freeNext, buf->freePrev, buf->blind.relname, buf->tag.blockNum, buf->flags, buf->refcount, PrivateRefCount[i]); } @@ -1601,33 +1791,42 @@ BufferPoolBlowaway() * FlushRelationBuffers * * This function removes from the buffer pool all pages of a relation - * that have blocknumber >= specified block. If doFlush is true, - * dirty buffers are written out --- otherwise it's an error for any - * of the buffers to be dirty. + * that have blocknumber >= specified block. Pages that are dirty are + * written out first. If expectDirty is false, a notice is emitted + * warning of dirty buffers, but we proceed anyway. An error code is + * returned if we fail to dump a dirty buffer or if we find one of + * the target pages is pinned into the cache. * * This is used by VACUUM before truncating the relation to the given - * number of blocks. For VACUUM, we pass doFlush = false since it would - * mean a bug in VACUUM if any of the unwanted pages were still dirty. - * (TRUNCATE TABLE also uses it in the same way.) + * number of blocks. For VACUUM, we pass expectDirty = false since it + * could mean a bug in VACUUM if any of the unwanted pages were still + * dirty. (TRUNCATE TABLE also uses it in the same way.) * - * This is also used by RENAME TABLE (with block = 0 and doFlush = true) + * This is also used by RENAME TABLE (with block=0 and expectDirty=true) * to clear out the buffer cache before renaming the physical files of * a relation. Without that, some other backend might try to do a - * blind write of a buffer page (relying on the sb_relname of the buffer) + * blind write of a buffer page (relying on the BlindId of the buffer) * and fail because it's not got the right filename anymore. * * In both cases, the caller should be holding AccessExclusiveLock on * the target relation to ensure that no other backend is busy reading - * more blocks of the relation... + * more blocks of the relation. * - * Returns: 0 - Ok, -1 - DIRTY, -2 - PINNED + * Formerly, we considered it an error condition if we found unexpectedly + * dirty buffers. However, since BufferSync no longer forces out all + * dirty buffers at every xact commit, it's possible for dirty buffers + * to still be present in the cache due to failure of an earlier + * transaction. So, downgrade the error to a mere notice. Maybe we + * shouldn't even emit a notice... + * + * Returns: 0 - Ok, -1 - FAILED TO WRITE DIRTY BUFFER, -2 - PINNED * * XXX currently it sequentially searches the buffer pool, should be * changed to more clever ways of searching. * -------------------------------------------------------------------- */ int -FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush) +FlushRelationBuffers(Relation rel, BlockNumber block, bool expectDirty) { int i; BufferDesc *buf; @@ -1642,21 +1841,15 @@ FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush) { if (buf->flags & BM_DIRTY) { - if (doFlush) - { - if (FlushBuffer(-i-1, false) != STATUS_OK) - { - elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it", - RelationGetRelationName(rel), - block, buf->tag.blockNum); - return -1; - } - } - else - { + if (! expectDirty) elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty", RelationGetRelationName(rel), block, buf->tag.blockNum); + if (FlushBuffer(-i-1, false) != STATUS_OK) + { + elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it", + RelationGetRelationName(rel), + block, buf->tag.blockNum); return -1; } } @@ -1676,39 +1869,42 @@ FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush) SpinAcquire(BufMgrLock); for (i = 0; i < NBuffers; i++) { + recheck: buf = &BufferDescriptors[i]; - if (buf->tag.relId.dbId == MyDatabaseId && - buf->tag.relId.relId == RelationGetRelid(rel) && + if (buf->tag.relId.relId == RelationGetRelid(rel) && + (buf->tag.relId.dbId == MyDatabaseId || + buf->tag.relId.dbId == (Oid) NULL) && buf->tag.blockNum >= block) { if (buf->flags & BM_DIRTY) { - if (doFlush) - { - SpinRelease(BufMgrLock); - if (FlushBuffer(i+1, false) != STATUS_OK) - { - elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it", - buf->sb_relname, block, buf->tag.blockNum, - PrivateRefCount[i], buf->refcount); - return -1; - } - SpinAcquire(BufMgrLock); - } - else - { - SpinRelease(BufMgrLock); + PinBuffer(buf); + SpinRelease(BufMgrLock); + if (! expectDirty) elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d)", - buf->sb_relname, block, buf->tag.blockNum, + RelationGetRelationName(rel), block, + buf->tag.blockNum, + PrivateRefCount[i], buf->refcount); + if (FlushBuffer(i+1, true) != STATUS_OK) + { + elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it", + RelationGetRelationName(rel), block, + buf->tag.blockNum, PrivateRefCount[i], buf->refcount); return -1; } + SpinAcquire(BufMgrLock); + /* Buffer could already be reassigned, so must recheck + * whether it still belongs to rel before freeing it! + */ + goto recheck; } if (!(buf->flags & BM_FREE)) { SpinRelease(BufMgrLock); elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is referenced (private %ld, global %d)", - buf->sb_relname, block, buf->tag.blockNum, + RelationGetRelationName(rel), block, + buf->tag.blockNum, PrivateRefCount[i], buf->refcount); return -2; } @@ -1755,11 +1951,6 @@ ReleaseBuffer(Buffer buffer) AddBufferToFreelist(bufHdr); bufHdr->flags |= BM_FREE; } - if (CommitInfoNeedsSave[buffer - 1]) - { - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - CommitInfoNeedsSave[buffer - 1] = 0; - } SpinRelease(BufMgrLock); } @@ -1777,7 +1968,7 @@ IncrBufferRefCount_Debug(char *file, int line, Buffer buffer) fprintf(stderr, "PIN(Incr) %ld relname = %s, blockNum = %d, \ refcount = %ld, file: %s, line: %d\n", - buffer, buf->sb_relname, buf->tag.blockNum, + buffer, buf->blind.relname, buf->tag.blockNum, PrivateRefCount[buffer - 1], file, line); } } @@ -1795,7 +1986,7 @@ ReleaseBuffer_Debug(char *file, int line, Buffer buffer) fprintf(stderr, "UNPIN(Rel) %ld relname = %s, blockNum = %d, \ refcount = %ld, file: %s, line: %d\n", - buffer, buf->sb_relname, buf->tag.blockNum, + buffer, buf->blind.relname, buf->tag.blockNum, PrivateRefCount[buffer - 1], file, line); } } @@ -1822,7 +2013,7 @@ ReleaseAndReadBuffer_Debug(char *file, fprintf(stderr, "UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ refcount = %ld, file: %s, line: %d\n", - buffer, buf->sb_relname, buf->tag.blockNum, + buffer, buf->blind.relname, buf->tag.blockNum, PrivateRefCount[buffer - 1], file, line); } if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) @@ -1831,7 +2022,7 @@ refcount = %ld, file: %s, line: %d\n", fprintf(stderr, "PIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ refcount = %ld, file: %s, line: %d\n", - b, buf->sb_relname, buf->tag.blockNum, + b, buf->blind.relname, buf->tag.blockNum, PrivateRefCount[b - 1], file, line); } return b; @@ -1983,11 +2174,43 @@ _bm_die(Oid dbId, Oid relId, int blkNo, int bufNo, #endif /* BMTRACE */ +/* + * SetBufferCommitInfoNeedsSave + * + * Mark a buffer dirty when we have updated tuple commit-status bits in it. + * + * This is similar to WriteNoReleaseBuffer, except that we do not set + * SharedBufferChanged or BufferDirtiedByMe, because we have not made a + * critical change that has to be flushed to disk before xact commit --- the + * status-bit update could be redone by someone else just as easily. The + * buffer will be marked dirty, but it will not be written to disk until + * there is another reason to write it. + * + * This routine might get called many times on the same page, if we are making + * the first scan after commit of an xact that added/deleted many tuples. + * So, be as quick as we can if the buffer is already dirty. + */ void SetBufferCommitInfoNeedsSave(Buffer buffer) { - if (!BufferIsLocal(buffer)) - CommitInfoNeedsSave[buffer - 1]++; + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) + return; + + if (BAD_BUFFER_ID(buffer)) + return; + + bufHdr = &BufferDescriptors[buffer - 1]; + + if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) != + (BM_DIRTY | BM_JUST_DIRTIED)) + { + SpinAcquire(BufMgrLock); + Assert(bufHdr->refcount > 0); + bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); + SpinRelease(BufMgrLock); + } } void @@ -2175,7 +2398,16 @@ static void StartBufferIO(BufferDesc *buf, bool forInput) Assert(!(buf->flags & BM_IO_IN_PROGRESS)); buf->flags |= BM_IO_IN_PROGRESS; #ifdef HAS_TEST_AND_SET - Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))) + /* + * There used to be + * + * Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))); + * + * here, but that's wrong because of the way WaitIO works: someone else + * waiting for the I/O to complete will succeed in grabbing the lock for + * a few instructions, and if we context-swap back to here the Assert + * could fail. Tiny window for failure, but I've seen it happen -- tgl + */ S_LOCK(&(buf->io_in_progress_lock)); #endif /* HAS_TEST_AND_SET */ InProgressBuf = buf; @@ -2217,7 +2449,7 @@ static void ContinueBufferIO(BufferDesc *buf, bool forInput) IsForInput = forInput; } -extern void InitBufferIO(void) +void InitBufferIO(void) { InProgressBuf = (BufferDesc *)0; } @@ -2229,7 +2461,7 @@ extern void InitBufferIO(void) * set in case of output,this routine would kill all * backends and reset postmaster. */ -extern void AbortBufferIO(void) +void AbortBufferIO(void) { BufferDesc *buf = InProgressBuf; if (buf) @@ -2252,8 +2484,8 @@ extern void AbortBufferIO(void) buf->flags |= BM_DIRTY; } buf->flags |= BM_IO_ERROR; - TerminateBufferIO(buf); buf->flags &= ~BM_IO_IN_PROGRESS; + TerminateBufferIO(buf); SpinRelease(BufMgrLock); } } diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 7b927886f5..a77a16c2a0 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.20 2000/01/26 05:56:52 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.21 2000/04/09 04:43:19 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -122,7 +122,7 @@ PinBuffer_Debug(char *file, int line, BufferDesc *buf) fprintf(stderr, "PIN(Pin) %ld relname = %s, blockNum = %d, \ refcount = %ld, file: %s, line: %d\n", - buffer, buf->sb_relname, buf->tag.blockNum, + buffer, buf->blind.relname, buf->tag.blockNum, PrivateRefCount[buffer - 1], file, line); } } @@ -168,7 +168,7 @@ UnpinBuffer_Debug(char *file, int line, BufferDesc *buf) fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, \ refcount = %ld, file: %s, line: %d\n", - buffer, buf->sb_relname, buf->tag.blockNum, + buffer, buf->blind.relname, buf->tag.blockNum, PrivateRefCount[buffer - 1], file, line); } } @@ -304,7 +304,7 @@ PrintBufferFreeList() int i = (buf - BufferDescriptors); printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld, nxt=%ld prv=%ld)\n", - i, buf->sb_relname, buf->tag.blockNum, + i, buf->blind.relname, buf->tag.blockNum, buf->flags, buf->refcount, PrivateRefCount[i], buf->freeNext, buf->freePrev); diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index b11e1d999c..1fb753dbf3 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.54 2000/03/17 02:36:19 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.55 2000/04/09 04:43:19 tgl Exp $ * * NOTES: * @@ -293,7 +293,7 @@ LruDelete(File file) vfdP->seekPos = (long) lseek(vfdP->fd, 0L, SEEK_CUR); Assert(vfdP->seekPos != -1); - /* if we have written to the file, sync it */ + /* if we have written to the file, sync it before closing */ if (vfdP->fdstate & FD_DIRTY) { returnValue = pg_fsync(vfdP->fd); @@ -381,9 +381,6 @@ tryAgain: returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET); Assert(returnValue != -1); } - - /* Update state as appropriate for re-open (needed?) */ - vfdP->fdstate &= ~FD_DIRTY; } /* @@ -804,7 +801,7 @@ FileWrite(File file, char *buffer, int amount) if (returnCode > 0) VfdCache[file].seekPos += returnCode; - /* record the write */ + /* mark the file as needing fsync */ VfdCache[file].fdstate |= FD_DIRTY; return returnCode; @@ -873,6 +870,35 @@ FileTruncate(File file, long offset) return returnCode; } +/* + * FileSync --- if a file is marked as dirty, fsync it. + * + * The FD_DIRTY bit is slightly misnamed: it doesn't mean that we need to + * write the file, but that we *have* written it and need to execute an + * fsync() to ensure the changes are down on disk before we mark the current + * transaction committed. + * + * FD_DIRTY is set by FileWrite or by an explicit FileMarkDirty() call. + * It is cleared after successfully fsync'ing the file. FileClose() will + * fsync a dirty File that is about to be closed, since there will be no + * other place to remember the need to fsync after the VFD is gone. + * + * Note that the DIRTY bit is logically associated with the actual disk file, + * not with any particular kernel FD we might have open for it. We assume + * that fsync will force out any dirty buffers for that file, whether or not + * they were written through the FD being used for the fsync call --- they + * might even have been written by some other backend! + * + * Note also that LruDelete currently fsyncs a dirty file that it is about + * to close the kernel file descriptor for. The idea there is to avoid + * having to re-open the kernel descriptor later. But it's not real clear + * that this is a performance win; we could end up fsyncing the same file + * multiple times in a transaction, which would probably cost more time + * than is saved by avoiding an open() call. This should be studied. + * + * This routine used to think it could skip the fsync if the file is + * physically closed, but that is now WRONG; see comments for FileMarkDirty. + */ int FileSync(File file) { @@ -880,23 +906,66 @@ FileSync(File file) Assert(FileIsValid(file)); - /* - * If the file isn't open, then we don't need to sync it; we always - * sync files when we close them. Also, if we haven't done any writes - * that we haven't already synced, we can ignore the request. - */ - - if (VfdCache[file].fd < 0 || !(VfdCache[file].fdstate & FD_DIRTY)) - returnCode = 0; - else + if (!(VfdCache[file].fdstate & FD_DIRTY)) { - returnCode = pg_fsync(VfdCache[file].fd); + /* Need not sync if file is not dirty. */ + returnCode = 0; + } + else if (disableFsync) + { + /* Don't force the file open if pg_fsync isn't gonna sync it. */ + returnCode = 0; VfdCache[file].fdstate &= ~FD_DIRTY; } + else + { + /* We don't use FileAccess() because we don't want to force the + * file to the front of the LRU ring; we aren't expecting to + * access it again soon. + */ + if (FileIsNotOpen(file)) + { + returnCode = LruInsert(file); + if (returnCode != 0) + return returnCode; + } + returnCode = pg_fsync(VfdCache[file].fd); + if (returnCode == 0) + VfdCache[file].fdstate &= ~FD_DIRTY; + } return returnCode; } +/* + * FileMarkDirty --- mark a file as needing fsync at transaction commit. + * + * Since FileWrite marks the file dirty, this routine is not needed in + * normal use. It is called when the buffer manager detects that some other + * backend has written out a shared buffer that this backend dirtied (but + * didn't write) in the current xact. In that scenario, we need to fsync + * the file before we can commit. We cannot assume that the other backend + * has fsync'd the file yet; we need to do our own fsync to ensure that + * (a) the disk page is written and (b) this backend's commit is delayed + * until the write is complete. + * + * Note we are assuming that an fsync issued by this backend will write + * kernel disk buffers that were dirtied by another backend. Furthermore, + * it doesn't matter whether we currently have the file physically open; + * we must fsync even if we have to re-open the file to do it. + */ +void +FileMarkDirty(File file) +{ + Assert(FileIsValid(file)); + + DO_DB(elog(DEBUG, "FileMarkDirty: %d (%s)", + file, VfdCache[file].fileName)); + + VfdCache[file].fdstate |= FD_DIRTY; +} + + /* * Routines that want to use stdio (ie, FILE*) should use AllocateFile * rather than plain fopen(). This lets fd.c deal with freeing FDs if @@ -992,6 +1061,12 @@ closeAllVfds() * exit (it doesn't particularly care which). All still-open temporary-file * VFDs are closed, which also causes the underlying files to be deleted. * Furthermore, all "allocated" stdio files are closed. + * + * This routine is not involved in fsync'ing non-temporary files at xact + * commit; that is done by FileSync under control of the buffer manager. + * During a commit, that is done *before* control gets here. If we still + * have any needs-fsync bits set when we get here, we assume this is abort + * and clear them. */ void AtEOXact_Files(void) @@ -1006,6 +1081,8 @@ AtEOXact_Files(void) if ((VfdCache[i].fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL) FileClose(i); + else + VfdCache[i].fdstate &= ~FD_DIRTY; } } diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 7382b1867d..233bbb0ac2 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.64 2000/02/07 02:38:18 inoue Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.65 2000/04/09 04:43:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -48,7 +48,12 @@ typedef struct _MdfdVec { int mdfd_vfd; /* fd number in vfd pool */ - uint16 mdfd_flags; /* clean, dirty, free */ + int mdfd_flags; /* free, temporary */ + +/* these are the assigned bits in mdfd_flags: */ +#define MDFD_FREE (1 << 0)/* unused entry */ +#define MDFD_TEMP (1 << 1)/* close this entry at transaction end */ + int mdfd_lstbcnt; /* most recent block count */ int mdfd_nextFree; /* next free vector */ #ifndef LET_OS_MANAGE_FILESIZE @@ -62,13 +67,13 @@ static int Md_Free = -1; /* head of freelist of unused fdvec entries */ static int CurFd = 0; /* first never-used fdvec index */ static MemoryContext MdCxt; /* context for all my allocations */ -#define MDFD_DIRTY (uint16) 0x01 -#define MDFD_FREE (uint16) 0x02 - /* routines declared here */ +static void mdclose_fd(int fd); static int _mdfd_getrelnfd(Relation reln); static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags); static MdfdVec *_mdfd_getseg(Relation reln, int blkno); +static MdfdVec *_mdfd_blind_getseg(char *dbname, char *relname, + Oid dbid, Oid relid, int blkno); static int _fdvec_alloc(void); static void _fdvec_free(int); static BlockNumber _mdnblocks(File file, Size blcksz); @@ -186,6 +191,8 @@ mdcreate(Relation reln) #endif Md_fdvec[vfd].mdfd_lstbcnt = 0; + pfree(path); + return vfd; } @@ -290,9 +297,6 @@ mdextend(Relation reln, char *buffer) return SM_FAIL; } - /* remember that we did a write, so we can sync at xact commit */ - v->mdfd_flags |= MDFD_DIRTY; - /* try to keep the last block count current, though it's just a hint */ #ifndef LET_OS_MANAGE_FILESIZE if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0) @@ -367,6 +371,8 @@ mdopen(Relation reln) #endif #endif + pfree(path); + return vfd; } @@ -382,13 +388,24 @@ int mdclose(Relation reln) { int fd; - MdfdVec *v; - MemoryContext oldcxt; fd = RelationGetFile(reln); if (fd < 0) return SM_SUCCESS; /* already closed, so no work */ + mdclose_fd(fd); + + reln->rd_fd = -1; + + return SM_SUCCESS; +} + +static void +mdclose_fd(int fd) +{ + MdfdVec *v; + MemoryContext oldcxt; + oldcxt = MemoryContextSwitchTo(MdCxt); #ifndef LET_OS_MANAGE_FILESIZE for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;) @@ -398,17 +415,14 @@ mdclose(Relation reln) /* if not closed already */ if (v->mdfd_vfd >= 0) { - /* * We sync the file descriptor so that we don't need to reopen - * it at transaction commit to force changes to disk. + * it at transaction commit to force changes to disk. (This + * is not really optional, because we are about to forget that + * the file even exists...) */ - FileSync(v->mdfd_vfd); FileClose(v->mdfd_vfd); - - /* mark this file descriptor as clean in our private table */ - v->mdfd_flags &= ~MDFD_DIRTY; } /* Now free vector */ v = v->mdfd_chain; @@ -423,28 +437,20 @@ mdclose(Relation reln) { if (v->mdfd_vfd >= 0) { - /* * We sync the file descriptor so that we don't need to reopen - * it at transaction commit to force changes to disk. + * it at transaction commit to force changes to disk. (This + * is not really optional, because we are about to forget that + * the file even exists...) */ - FileSync(v->mdfd_vfd); FileClose(v->mdfd_vfd); - - /* mark this file descriptor as clean in our private table */ - v->mdfd_flags &= ~MDFD_DIRTY; } } #endif MemoryContextSwitchTo(oldcxt); _fdvec_free(fd); - - /* be sure to mark relation closed */ - reln->rd_fd = -1; - - return SM_SUCCESS; } /* @@ -521,8 +527,6 @@ mdwrite(Relation reln, BlockNumber blocknum, char *buffer) if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) status = SM_FAIL; - v->mdfd_flags |= MDFD_DIRTY; - return status; } @@ -560,14 +564,6 @@ mdflush(Relation reln, BlockNumber blocknum, char *buffer) || FileSync(v->mdfd_vfd) < 0) status = SM_FAIL; - /* - * By here, the block is written and changes have been forced to - * stable storage. Mark the descriptor as clean until the next write, - * so we don't sync it again unnecessarily at transaction commit. - */ - - v->mdfd_flags &= ~MDFD_DIRTY; - return status; } @@ -575,141 +571,89 @@ mdflush(Relation reln, BlockNumber blocknum, char *buffer) * mdblindwrt() -- Write a block to disk blind. * * We have to be able to do this using only the name and OID of - * the database and relation in which the block belongs. This - * is a synchronous write. + * the database and relation in which the block belongs. Otherwise + * this is just like mdwrite(). */ int -mdblindwrt(char *dbstr, - char *relstr, +mdblindwrt(char *dbname, + char *relname, Oid dbid, Oid relid, BlockNumber blkno, char *buffer) { - int fd; - int segno; - long seekpos; int status; - char *path; + long seekpos; + MdfdVec *v; -#ifndef LET_OS_MANAGE_FILESIZE - int nchars; + v = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno); - /* be sure we have enough space for the '.segno', if any */ - segno = blkno / RELSEG_SIZE; - if (segno > 0) - nchars = 10; - else - nchars = 0; - - /* construct the path to the file and open it */ - /* system table? then put in system area... */ - if (dbid == (Oid) 0) - { - path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars); - if (segno == 0) - sprintf(path, "%s/%s", DataDir, relstr); - else - sprintf(path, "%s/%s.%d", DataDir, relstr, segno); - } - /* user table? then put in user database area... */ - else if (dbid == MyDatabaseId) - { - path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2 + nchars); - if (segno == 0) - sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr); - else - sprintf(path, "%s%c%s.%d", DatabasePath, SEP_CHAR, relstr, segno); - } - else -/* this is work arround only !!! */ - { - char dbpath[MAXPGPATH]; - Oid id; - char *tmpPath; - - GetRawDatabaseInfo(dbstr, &id, dbpath); - - if (id != dbid) - elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid); - tmpPath = ExpandDatabasePath(dbpath); - if (tmpPath == NULL) - elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr); - path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2 + nchars); - if (segno == 0) - sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr); - else - sprintf(path, "%s%c%s.%d", tmpPath, SEP_CHAR, relstr, segno); - pfree(tmpPath); - } -#else - /* construct the path to the file and open it */ - /* system table? then put in system area... */ - if (dbid == (Oid) 0) - { - path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2); - sprintf(path, "%s/%s", DataDir, relstr); - } - /* user table? then put in user database area... */ - else if (dbid == MyDatabaseId) - { - path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2); - sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr); - } - else -/* this is work arround only !!! */ - { - char dbpath[MAXPGPATH]; - Oid id; - char *tmpPath; - - GetRawDatabaseInfo(dbstr, &id, dbpath); - - if (id != dbid) - elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid); - tmpPath = ExpandDatabasePath(dbpath); - if (tmpPath == NULL) - elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr); - path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2); - sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr); - pfree(tmpPath); - } -#endif - -#ifndef __CYGWIN32__ - if ((fd = open(path, O_RDWR, 0600)) < 0) -#else - if ((fd = open(path, O_RDWR | O_BINARY, 0600)) < 0) -#endif + if (v == NULL) return SM_FAIL; - /* seek to the right spot */ #ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE)); +#ifdef DIAGNOSTIC + if (seekpos >= BLCKSZ * RELSEG_SIZE) + elog(FATAL, "seekpos too big!"); +#endif #else seekpos = (long) (BLCKSZ * (blkno)); #endif - if (lseek(fd, seekpos, SEEK_SET) != seekpos) - { - close(fd); + if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) return SM_FAIL; - } status = SM_SUCCESS; - - /* write and sync the block */ - if (write(fd, buffer, BLCKSZ) != BLCKSZ || (pg_fsync(fd) < 0)) + if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) status = SM_FAIL; - if (close(fd) < 0) - status = SM_FAIL; - - pfree(path); - return status; } +/* + * mdmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync). + * + * Returns SM_SUCCESS or SM_FAIL. + */ +int +mdmarkdirty(Relation reln, BlockNumber blkno) +{ + MdfdVec *v; + + v = _mdfd_getseg(reln, blkno); + + FileMarkDirty(v->mdfd_vfd); + + return SM_SUCCESS; +} + +/* + * mdblindmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync). + * + * We have to be able to do this using only the name and OID of + * the database and relation in which the block belongs. Otherwise + * this is just like mdmarkdirty(). + */ +int +mdblindmarkdirty(char *dbname, + char *relname, + Oid dbid, + Oid relid, + BlockNumber blkno) +{ + MdfdVec *v; + + v = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno); + + if (v == NULL) + return SM_FAIL; + + FileMarkDirty(v->mdfd_vfd); + + return SM_SUCCESS; +} + /* * mdnblocks() -- Get the number of blocks stored in a relation. * @@ -873,19 +817,26 @@ mdcommit() for (i = 0; i < CurFd; i++) { -#ifndef LET_OS_MANAGE_FILESIZE - for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) -#else v = &Md_fdvec[i]; - if (v != (MdfdVec *) NULL) -#endif + if (v->mdfd_flags & MDFD_FREE) + continue; + if (v->mdfd_flags & MDFD_TEMP) { - if (v->mdfd_flags & MDFD_DIRTY) + /* Sync and close the file */ + mdclose_fd(i); + } + else + { + /* Sync, but keep the file entry */ + +#ifndef LET_OS_MANAGE_FILESIZE + for ( ; v != (MdfdVec *) NULL; v = v->mdfd_chain) +#else + if (v != (MdfdVec *) NULL) +#endif { if (FileSync(v->mdfd_vfd) < 0) return SM_FAIL; - - v->mdfd_flags &= ~MDFD_DIRTY; } } } @@ -908,13 +859,14 @@ mdabort() for (i = 0; i < CurFd; i++) { -#ifndef LET_OS_MANAGE_FILESIZE - for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) - v->mdfd_flags &= ~MDFD_DIRTY; -#else v = &Md_fdvec[i]; - v->mdfd_flags &= ~MDFD_DIRTY; -#endif + if (v->mdfd_flags & MDFD_FREE) + continue; + if (v->mdfd_flags & MDFD_TEMP) + { + /* Close the file */ + mdclose_fd(i); + } } return SM_SUCCESS; @@ -995,7 +947,6 @@ _fdvec_free(int fdvec) Md_fdvec[fdvec].mdfd_nextFree = Md_Free; Md_fdvec[fdvec].mdfd_flags = MDFD_FREE; Md_Free = fdvec; - } static MdfdVec * @@ -1004,19 +955,17 @@ _mdfd_openseg(Relation reln, int segno, int oflags) MemoryContext oldcxt; MdfdVec *v; int fd; - bool dofree; char *path, *fullpath; /* be sure we have enough space for the '.segno', if any */ path = relpath(RelationGetPhysicalRelationName(reln)); - dofree = false; if (segno > 0) { - dofree = true; fullpath = (char *) palloc(strlen(path) + 12); sprintf(fullpath, "%s.%d", path, segno); + pfree(path); } else fullpath = path; @@ -1028,8 +977,7 @@ _mdfd_openseg(Relation reln, int segno, int oflags) fd = FileNameOpenFile(fullpath, O_RDWR | O_BINARY | oflags, 0600); #endif - if (dofree) - pfree(fullpath); + pfree(fullpath); if (fd < 0) return (MdfdVec *) NULL; @@ -1109,6 +1057,104 @@ _mdfd_getseg(Relation reln, int blkno) return v; } +/* Find the segment of the relation holding the specified block. + * This is the same as _mdfd_getseg() except that we must work + * "blind" with no Relation struct. + * + * NOTE: we have no easy way to tell whether a FD already exists for the + * target relation, so we always make a new one. This should probably + * be improved somehow, but I doubt it's a significant performance issue + * under normal circumstances. The FD is marked to be closed at end of xact + * so that we don't accumulate a lot of dead FDs. + */ + +static MdfdVec * +_mdfd_blind_getseg(char *dbname, char *relname, Oid dbid, Oid relid, + int blkno) +{ + MdfdVec *v; + char *path; + int fd; + int vfd; +#ifndef LET_OS_MANAGE_FILESIZE + int segno; + int targsegno; +#endif + + /* construct the path to the file and open it */ + path = relpath_blind(dbname, relname, dbid, relid); + +#ifndef __CYGWIN32__ + fd = FileNameOpenFile(path, O_RDWR, 0600); +#else + fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600); +#endif + + if (fd < 0) + return NULL; + + vfd = _fdvec_alloc(); + if (vfd < 0) + return NULL; + + Md_fdvec[vfd].mdfd_vfd = fd; + Md_fdvec[vfd].mdfd_flags = MDFD_TEMP; + Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ); +#ifndef LET_OS_MANAGE_FILESIZE + Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL; + +#ifdef DIAGNOSTIC + if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE) + elog(FATAL, "segment too big on relopen!"); +#endif + + targsegno = blkno / RELSEG_SIZE; + for (v = &Md_fdvec[vfd], segno = 1; segno <= targsegno; segno++) + { + char *segpath; + MdfdVec *newv; + MemoryContext oldcxt; + + segpath = (char *) palloc(strlen(path) + 12); + sprintf(segpath, "%s.%d", path, segno); + +#ifndef __CYGWIN32__ + fd = FileNameOpenFile(segpath, O_RDWR | O_CREAT, 0600); +#else + fd = FileNameOpenFile(segpath, O_RDWR | O_BINARY | O_CREAT, 0600); +#endif + + pfree(segpath); + + if (fd < 0) + return (MdfdVec *) NULL; + + /* allocate an mdfdvec entry for it */ + oldcxt = MemoryContextSwitchTo(MdCxt); + newv = (MdfdVec *) palloc(sizeof(MdfdVec)); + MemoryContextSwitchTo(oldcxt); + + /* fill the entry */ + newv->mdfd_vfd = fd; + newv->mdfd_flags = MDFD_TEMP; + newv->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ); + newv->mdfd_chain = (MdfdVec *) NULL; +#ifdef DIAGNOSTIC + if (newv->mdfd_lstbcnt > RELSEG_SIZE) + elog(FATAL, "segment too big on open!"); +#endif + v->mdfd_chain = newv; + v = newv; + } +#else + v = &Md_fdvec[vfd]; +#endif + + pfree(path); + + return v; +} + static BlockNumber _mdnblocks(File file, Size blcksz) { diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index f0c20f8219..839636b118 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.32 2000/01/26 05:57:05 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.33 2000/04/09 04:43:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -23,21 +23,30 @@ static void smgrshutdown(int dummy); typedef struct f_smgr { - int (*smgr_init) ();/* may be NULL */ - int (*smgr_shutdown) (); /* may be NULL */ - int (*smgr_create) (); - int (*smgr_unlink) (); - int (*smgr_extend) (); - int (*smgr_open) (); - int (*smgr_close) (); - int (*smgr_read) (); - int (*smgr_write) (); - int (*smgr_flush) (); - int (*smgr_blindwrt) (); - int (*smgr_nblocks) (); - int (*smgr_truncate) (); - int (*smgr_commit) (); /* may be NULL */ - int (*smgr_abort) (); /* may be NULL */ + int (*smgr_init) (void); /* may be NULL */ + int (*smgr_shutdown) (void); /* may be NULL */ + int (*smgr_create) (Relation reln); + int (*smgr_unlink) (Relation reln); + int (*smgr_extend) (Relation reln, char *buffer); + int (*smgr_open) (Relation reln); + int (*smgr_close) (Relation reln); + int (*smgr_read) (Relation reln, BlockNumber blocknum, + char *buffer); + int (*smgr_write) (Relation reln, BlockNumber blocknum, + char *buffer); + int (*smgr_flush) (Relation reln, BlockNumber blocknum, + char *buffer); + int (*smgr_blindwrt) (char *dbname, char *relname, + Oid dbid, Oid relid, + BlockNumber blkno, char *buffer); + int (*smgr_markdirty) (Relation reln, BlockNumber blkno); + int (*smgr_blindmarkdirty) (char *dbname, char *relname, + Oid dbid, Oid relid, + BlockNumber blkno); + int (*smgr_nblocks) (Relation reln); + int (*smgr_truncate) (Relation reln, int nblocks); + int (*smgr_commit) (void); /* may be NULL */ + int (*smgr_abort) (void); /* may be NULL */ } f_smgr; /* @@ -49,14 +58,14 @@ static f_smgr smgrsw[] = { /* magnetic disk */ {mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose, - mdread, mdwrite, mdflush, mdblindwrt, mdnblocks, mdtruncate, - mdcommit, mdabort}, + mdread, mdwrite, mdflush, mdblindwrt, mdmarkdirty, mdblindmarkdirty, + mdnblocks, mdtruncate, mdcommit, mdabort}, #ifdef STABLE_MEMORY_STORAGE /* main memory */ {mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose, - mmread, mmwrite, mmflush, mmblindwrt, mmnblocks, NULL, - mmcommit, mmabort}, + mmread, mmwrite, mmflush, mmblindwrt, mmmarkdirty, mmblindmarkdirty, + mmnblocks, NULL, mmcommit, mmabort}, #endif }; @@ -299,6 +308,7 @@ smgrblindwrt(int16 which, char *relstr; int status; + /* strdup here is probably redundant */ dbstr = pstrdup(dbname); relstr = pstrdup(relname); @@ -315,6 +325,67 @@ smgrblindwrt(int16 which, return status; } +/* + * smgrmarkdirty() -- Mark a page dirty (needs fsync). + * + * Mark the specified page as needing to be fsync'd before commit. + * Ordinarily, the storage manager will do this implicitly during + * smgrwrite(). However, the buffer manager may discover that some + * other backend has written a buffer that we dirtied in the current + * transaction. In that case, we still need to fsync the file to be + * sure the page is down to disk before we commit. + */ +int +smgrmarkdirty(int16 which, + Relation reln, + BlockNumber blkno) +{ + int status; + + status = (*(smgrsw[which].smgr_markdirty)) (reln, blkno); + + if (status == SM_FAIL) + elog(ERROR, "cannot mark block %d of %s", + blkno, RelationGetRelationName(reln)); + + return status; +} + +/* + * smgrblindmarkdirty() -- Mark a page dirty, "blind". + * + * Just like smgrmarkdirty, except we don't have a reldesc. + */ +int +smgrblindmarkdirty(int16 which, + char *dbname, + char *relname, + Oid dbid, + Oid relid, + BlockNumber blkno) +{ + char *dbstr; + char *relstr; + int status; + + /* strdup here is probably redundant */ + dbstr = pstrdup(dbname); + relstr = pstrdup(relname); + + status = (*(smgrsw[which].smgr_blindmarkdirty)) (dbstr, relstr, + dbid, relid, + blkno); + + if (status == SM_FAIL) + elog(ERROR, "cannot mark block %d of %s [%s] blind", + blkno, relstr, dbstr); + + pfree(dbstr); + pfree(relstr); + + return status; +} + /* * smgrnblocks() -- Calculate the number of POSTGRES blocks in the * supplied relation. @@ -378,7 +449,6 @@ smgrcommit() return SM_SUCCESS; } -#ifdef NOT_USED int smgrabort() { @@ -396,8 +466,6 @@ smgrabort() return SM_SUCCESS; } -#endif - #ifdef NOT_USED bool smgriswo(int16 smgrno) diff --git a/src/include/catalog/catalog.h b/src/include/catalog/catalog.h index 1d99fe567c..8eaa1a9c8b 100644 --- a/src/include/catalog/catalog.h +++ b/src/include/catalog/catalog.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2000, PostgreSQL, Inc * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: catalog.h,v 1.10 2000/01/26 05:57:56 momjian Exp $ + * $Id: catalog.h,v 1.11 2000/04/09 04:43:14 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -17,6 +17,8 @@ #include "access/tupdesc.h" extern char *relpath(const char *relname); +extern char *relpath_blind(const char *dbname, const char *relname, + Oid dbid, Oid relid); extern bool IsSystemRelationName(const char *relname); extern bool IsSharedSystemRelationName(const char *relname); extern Oid newoid(void); diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index f18322f169..0c5b6d64c0 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2000, PostgreSQL, Inc * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: buf_internals.h,v 1.35 2000/01/26 05:58:32 momjian Exp $ + * $Id: buf_internals.h,v 1.36 2000/04/09 04:43:18 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -61,6 +61,16 @@ typedef struct buftag (a)->relId = (xx_reln)->rd_lockInfo.lockRelId \ ) +/* If we have to write a buffer "blind" (without a relcache entry), + * the BufferTag is not enough information. BufferBlindId carries the + * additional information needed. + */ +typedef struct bufblindid +{ + char dbname[NAMEDATALEN]; /* name of db in which buf belongs */ + char relname[NAMEDATALEN]; /* name of reln */ +} BufferBlindId; + #define BAD_BUFFER_ID(bid) ((bid) < 1 || (bid) > NBuffers) #define INVALID_DESCRIPTOR (-3) @@ -98,8 +108,7 @@ typedef struct sbufdesc bool ri_lock; /* read-intent lock */ bool w_lock; /* context exclusively locked */ - char sb_dbname[NAMEDATALEN]; /* name of db in which buf belongs */ - char sb_relname[NAMEDATALEN]; /* name of reln */ + BufferBlindId blind; /* extra info to support blind write */ } BufferDesc; /* @@ -164,7 +173,9 @@ extern BufferDesc *BufferDescriptors; extern BufferBlock BufferBlocks; extern long *PrivateRefCount; extern bits8 *BufferLocks; -extern long *CommitInfoNeedsSave; +extern BufferTag *BufferTagLastDirtied; +extern BufferBlindId *BufferBlindLastDirtied; +extern bool *BufferDirtiedByMe; extern SPINLOCK BufMgrLock; /* localbuf.c */ diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 9c83e04e2e..e852dfd052 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2000, PostgreSQL, Inc * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: bufmgr.h,v 1.35 2000/03/31 02:43:30 tgl Exp $ + * $Id: bufmgr.h,v 1.36 2000/04/09 04:43:18 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -164,7 +164,7 @@ extern int FlushBuffer(Buffer buffer, bool release); extern void InitBufferPool(IPCKey key); extern void PrintBufferUsage(FILE *statfp); extern void ResetBufferUsage(void); -extern void ResetBufferPool(void); +extern void ResetBufferPool(bool isCommit); extern int BufferPoolCheckLeak(void); extern void FlushBufferPool(void); extern BlockNumber BufferGetBlockNumber(Buffer buffer); diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 74fdb94fa0..f82fc14cae 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2000, PostgreSQL, Inc * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: fd.h,v 1.19 2000/01/26 05:58:32 momjian Exp $ + * $Id: fd.h,v 1.20 2000/04/09 04:43:18 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -15,7 +15,7 @@ /* * calls: * - * File {Close, Read, Write, Seek, Tell, Sync} + * File {Close, Read, Write, Seek, Tell, MarkDirty, Sync} * {File Name Open, Allocate, Free} File * * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES. @@ -58,6 +58,7 @@ extern int FileWrite(File file, char *buffer, int amount); extern long FileSeek(File file, long offset, int whence); extern int FileTruncate(File file, long offset); extern int FileSync(File file); +extern void FileMarkDirty(File file); /* Operations that allow use of regular stdio --- USE WITH CAUTION */ extern FILE *AllocateFile(char *name, char *mode); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 2ef2467a09..053a63196e 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2000, PostgreSQL, Inc * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: smgr.h,v 1.17 2000/01/26 05:58:33 momjian Exp $ + * $Id: smgr.h,v 1.18 2000/04/09 04:43:18 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -29,17 +29,23 @@ extern int smgrunlink(int16 which, Relation reln); extern int smgrextend(int16 which, Relation reln, char *buffer); extern int smgropen(int16 which, Relation reln); extern int smgrclose(int16 which, Relation reln); -extern int smgrread(int16 which, Relation reln, BlockNumber blocknum, - char *buffer); -extern int smgrwrite(int16 which, Relation reln, BlockNumber blocknum, - char *buffer); -extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum, - char *buffer); -extern int smgrblindwrt(int16 which, char *dbname, char *relname, Oid dbid, - Oid relid, BlockNumber blkno, char *buffer); +extern int smgrread(int16 which, Relation reln, BlockNumber blocknum, + char *buffer); +extern int smgrwrite(int16 which, Relation reln, BlockNumber blocknum, + char *buffer); +extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum, + char *buffer); +extern int smgrblindwrt(int16 which, char *dbname, char *relname, + Oid dbid, Oid relid, + BlockNumber blkno, char *buffer); +extern int smgrmarkdirty(int16 which, Relation reln, BlockNumber blkno); +extern int smgrblindmarkdirty(int16 which, char *dbname, char *relname, + Oid dbid, Oid relid, + BlockNumber blkno); extern int smgrnblocks(int16 which, Relation reln); extern int smgrtruncate(int16 which, Relation reln, int nblocks); extern int smgrcommit(void); +extern int smgrabort(void); @@ -55,8 +61,11 @@ extern int mdclose(Relation reln); extern int mdread(Relation reln, BlockNumber blocknum, char *buffer); extern int mdwrite(Relation reln, BlockNumber blocknum, char *buffer); extern int mdflush(Relation reln, BlockNumber blocknum, char *buffer); -extern int mdblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid, - BlockNumber blkno, char *buffer); +extern int mdblindwrt(char *dbname, char *relname, Oid dbid, Oid relid, + BlockNumber blkno, char *buffer); +extern int mdmarkdirty(Relation reln, BlockNumber blkno); +extern int mdblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid, + BlockNumber blkno); extern int mdnblocks(Relation reln); extern int mdtruncate(Relation reln, int nblocks); extern int mdcommit(void); @@ -66,7 +75,6 @@ extern int mdabort(void); extern SPINLOCK MMCacheLock; extern int mminit(void); -extern int mmshutdown(void); extern int mmcreate(Relation reln); extern int mmunlink(Relation reln); extern int mmextend(Relation reln, char *buffer); @@ -75,11 +83,17 @@ extern int mmclose(Relation reln); extern int mmread(Relation reln, BlockNumber blocknum, char *buffer); extern int mmwrite(Relation reln, BlockNumber blocknum, char *buffer); extern int mmflush(Relation reln, BlockNumber blocknum, char *buffer); -extern int mmblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid, - BlockNumber blkno, char *buffer); +extern int mmblindwrt(char *dbname, char *relname, Oid dbid, Oid relid, + BlockNumber blkno, char *buffer); +extern int mmmarkdirty(Relation reln, BlockNumber blkno); +extern int mmblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid, + BlockNumber blkno); extern int mmnblocks(Relation reln); +extern int mmtruncate(Relation reln, int nblocks); extern int mmcommit(void); extern int mmabort(void); + +extern int mmshutdown(void); extern int MMShmemSize(void); /* smgrtype.c */