diff --git a/src/backend/storage/file/Makefile b/src/backend/storage/file/Makefile index d2198f2b93..ca6a0e4f7d 100644 --- a/src/backend/storage/file/Makefile +++ b/src/backend/storage/file/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/storage/file top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = fd.o buffile.o copydir.o reinit.o +OBJS = fd.o buffile.o copydir.o reinit.o sharedfileset.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 06bf2fadbf..fa9940da9b 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -31,12 +31,18 @@ * BufFile also supports temporary files that exceed the OS file size limit * (by opening multiple fd.c temporary files). This is an essential feature * for sorts and hashjoins on large amounts of data. + * + * BufFile supports temporary files that can be made read-only and shared with + * other backends, as infrastructure for parallel execution. Such files need + * to be created as a member of a SharedFileSet that all participants are + * attached to. *------------------------------------------------------------------------- */ #include "postgres.h" #include "executor/instrument.h" +#include "miscadmin.h" #include "pgstat.h" #include "storage/fd.h" #include "storage/buffile.h" @@ -70,6 +76,10 @@ struct BufFile bool isInterXact; /* keep open over transactions? */ bool dirty; /* does buffer need to be written? */ + bool readOnly; /* has the file been set to read only? */ + + SharedFileSet *fileset; /* space for segment files if shared */ + const char *name; /* name of this BufFile if shared */ /* * resowner is the ResourceOwner to use for underlying temp files. (We @@ -94,6 +104,7 @@ static void extendBufFile(BufFile *file); static void BufFileLoadBuffer(BufFile *file); static void BufFileDumpBuffer(BufFile *file); static int BufFileFlush(BufFile *file); +static File MakeNewSharedSegment(BufFile *file, int segment); /* @@ -117,6 +128,9 @@ makeBufFile(File firstfile) file->curOffset = 0L; file->pos = 0; file->nbytes = 0; + file->readOnly = false; + file->fileset = NULL; + file->name = NULL; return file; } @@ -134,7 +148,11 @@ extendBufFile(BufFile *file) oldowner = CurrentResourceOwner; CurrentResourceOwner = file->resowner; - pfile = OpenTemporaryFile(file->isInterXact); + if (file->fileset == NULL) + pfile = OpenTemporaryFile(file->isInterXact); + else + pfile = MakeNewSharedSegment(file, file->numFiles); + Assert(pfile >= 0); CurrentResourceOwner = oldowner; @@ -175,6 +193,189 @@ BufFileCreateTemp(bool interXact) return file; } +/* + * Build the name for a given segment of a given BufFile. + */ +static void +SharedSegmentName(char *name, const char *buffile_name, int segment) +{ + snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment); +} + +/* + * Create a new segment file backing a shared BufFile. + */ +static File +MakeNewSharedSegment(BufFile *buffile, int segment) +{ + char name[MAXPGPATH]; + File file; + + SharedSegmentName(name, buffile->name, segment); + file = SharedFileSetCreate(buffile->fileset, name); + + /* SharedFileSetCreate would've errored out */ + Assert(file > 0); + + return file; +} + +/* + * Create a BufFile that can be discovered and opened read-only by other + * backends that are attached to the same SharedFileSet using the same name. + * + * The naming scheme for shared BufFiles is left up to the calling code. The + * name will appear as part of one or more filenames on disk, and might + * provide clues to administrators about which subsystem is generating + * temporary file data. Since each SharedFileSet object is backed by one or + * more uniquely named temporary directory, names don't conflict with + * unrelated SharedFileSet objects. + */ +BufFile * +BufFileCreateShared(SharedFileSet *fileset, const char *name) +{ + BufFile *file; + + file = (BufFile *) palloc(sizeof(BufFile)); + file->fileset = fileset; + file->name = pstrdup(name); + file->numFiles = 1; + file->files = (File *) palloc(sizeof(File)); + file->files[0] = MakeNewSharedSegment(file, 0); + file->offsets = (off_t *) palloc(sizeof(off_t)); + file->offsets[0] = 0L; + file->isInterXact = false; + file->dirty = false; + file->resowner = CurrentResourceOwner; + file->curFile = 0; + file->curOffset = 0L; + file->pos = 0; + file->nbytes = 0; + file->readOnly = false; + file->name = pstrdup(name); + + return file; +} + +/* + * Open a file that was previously created in another backend (or this one) + * with BufFileCreateShared in the same SharedFileSet using the same name. + * The backend that created the file must have called BufFileClose() or + * BufFileExport() to make sure that it is ready to be opened by other + * backends and render it read-only. + */ +BufFile * +BufFileOpenShared(SharedFileSet *fileset, const char *name) +{ + BufFile *file = (BufFile *) palloc(sizeof(BufFile)); + char segment_name[MAXPGPATH]; + Size capacity = 16; + File *files = palloc(sizeof(File) * capacity); + int nfiles = 0; + + file = (BufFile *) palloc(sizeof(BufFile)); + files = palloc(sizeof(File) * capacity); + + /* + * We don't know how many segments there are, so we'll probe the + * filesystem to find out. + */ + for (;;) + { + /* See if we need to expand our file segment array. */ + if (nfiles + 1 > capacity) + { + capacity *= 2; + files = repalloc(files, sizeof(File) * capacity); + } + /* Try to load a segment. */ + SharedSegmentName(segment_name, name, nfiles); + files[nfiles] = SharedFileSetOpen(fileset, segment_name); + if (files[nfiles] <= 0) + break; + ++nfiles; + + CHECK_FOR_INTERRUPTS(); + } + + /* + * If we didn't find any files at all, then no BufFile exists with this + * name. + */ + if (nfiles == 0) + return NULL; + + file->numFiles = nfiles; + file->files = files; + file->offsets = (off_t *) palloc0(sizeof(off_t) * nfiles); + file->isInterXact = false; + file->dirty = false; + file->resowner = CurrentResourceOwner; /* Unused, can't extend */ + file->curFile = 0; + file->curOffset = 0L; + file->pos = 0; + file->nbytes = 0; + file->readOnly = true; /* Can't write to files opened this way */ + file->fileset = fileset; + file->name = pstrdup(name); + + return file; +} + +/* + * Delete a BufFile that was created by BufFileCreateShared in the given + * SharedFileSet using the given name. + * + * It is not necessary to delete files explicitly with this function. It is + * provided only as a way to delete files proactively, rather than waiting for + * the SharedFileSet to be cleaned up. + * + * Only one backend should attempt to delete a given name, and should know + * that it exists and has been exported or closed. + */ +void +BufFileDeleteShared(SharedFileSet *fileset, const char *name) +{ + char segment_name[MAXPGPATH]; + int segment = 0; + bool found = false; + + /* + * We don't know how many segments the file has. We'll keep deleting + * until we run out. If we don't manage to find even an initial segment, + * raise an error. + */ + for (;;) + { + SharedSegmentName(segment_name, name, segment); + if (!SharedFileSetDelete(fileset, segment_name, true)) + break; + found = true; + ++segment; + + CHECK_FOR_INTERRUPTS(); + } + + if (!found) + elog(ERROR, "could not delete unknown shared BufFile \"%s\"", name); +} + +/* + * BufFileExportShared --- flush and make read-only, in preparation for sharing. + */ +void +BufFileExportShared(BufFile *file) +{ + /* Must be a file belonging to a SharedFileSet. */ + Assert(file->fileset != NULL); + + /* It's probably a bug if someone calls this twice. */ + Assert(!file->readOnly); + + BufFileFlush(file); + file->readOnly = true; +} + /* * Close a BufFile * @@ -390,6 +591,8 @@ BufFileWrite(BufFile *file, void *ptr, size_t size) size_t nwritten = 0; size_t nthistime; + Assert(!file->readOnly); + while (size > 0) { if (file->pos >= BLCKSZ) diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index aa2fe2c6c0..2e93e4ad63 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -39,6 +39,14 @@ * for a long time, like relation files. It is the caller's responsibility * to close them, there is no automatic mechanism in fd.c for that. * + * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage + * temporary files that have names so that they can be shared between + * backends. Such files are automatically closed and count against the + * temporary file limit of the backend that creates them, but unlike anonymous + * files they are not automatically deleted. See sharedfileset.c for a shared + * ownership mechanism that provides automatic cleanup for shared files when + * the last of a group of backends detaches. + * * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively. * They behave like the corresponding native functions, except that the handle @@ -175,8 +183,9 @@ int max_safe_fds = 32; /* default if not changed */ #define FilePosIsUnknown(pos) ((pos) < 0) /* these are the assigned bits in fdstate below: */ -#define FD_TEMPORARY (1 << 0) /* T = delete when closed */ -#define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */ +#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */ +#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */ +#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */ typedef struct vfd { @@ -313,7 +322,7 @@ static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel) static void AtProcExit_Files(int code, Datum arg); static void CleanupTempFiles(bool isProcExit); -static void RemovePgTempFilesInDir(const char *tmpdirname); +static void RemovePgTempFilesInDir(const char *tmpdirname, bool unlink_all); static void RemovePgTempRelationFiles(const char *tsdirname); static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname); static bool looks_like_temp_rel_name(const char *name); @@ -326,6 +335,7 @@ static void walkdir(const char *path, static void pre_sync_fname(const char *fname, bool isdir, int elevel); #endif static void datadir_fsync_fname(const char *fname, bool isdir, int elevel); +static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel); static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel); static int fsync_parent_path(const char *fname, int elevel); @@ -1294,6 +1304,39 @@ FileAccess(File file) return 0; } +/* + * Called whenever a temporary file is deleted to report its size. + */ +static void +ReportTemporaryFileUsage(const char *path, off_t size) +{ + pgstat_report_tempfile(size); + + if (log_temp_files >= 0) + { + if ((size / 1024) >= log_temp_files) + ereport(LOG, + (errmsg("temporary file: path \"%s\", size %lu", + path, (unsigned long) size))); + } +} + +/* + * Called to register a temporary file for automatic close. + * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called + * before the file was opened. + */ +static void +RegisterTemporaryFile(File file) +{ + ResourceOwnerRememberFile(CurrentResourceOwner, file); + VfdCache[file].resowner = CurrentResourceOwner; + + /* Backup mechanism for closing at end of xact. */ + VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT; + have_xact_temporary_files = true; +} + /* * Called when we get a shared invalidation message on some relation. */ @@ -1378,6 +1421,67 @@ PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) return file; } +/* + * Create directory 'directory'. If necessary, create 'basedir', which must + * be the directory above it. This is designed for creating the top-level + * temporary directory on demand before creating a directory underneath it. + * Do nothing if the directory already exists. + * + * Directories created within the top-level temporary directory should begin + * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and + * deleted at startup by RemovePgTempFiles(). Further subdirectories below + * that do not need any particular prefix. +*/ +void +PathNameCreateTemporaryDir(const char *basedir, const char *directory) +{ + if (mkdir(directory, S_IRWXU) < 0) + { + if (errno == EEXIST) + return; + + /* + * Failed. Try to create basedir first in case it's missing. Tolerate + * EEXIST to close a race against another process following the same + * algorithm. + */ + if (mkdir(basedir, S_IRWXU) < 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("cannot create temporary directory \"%s\": %m", + basedir))); + + /* Try again. */ + if (mkdir(directory, S_IRWXU) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("cannot create temporary subdirectory \"%s\": %m", + directory))); + } +} + +/* + * Delete a directory and everything in it, if it exists. + */ +void +PathNameDeleteTemporaryDir(const char *dirname) +{ + struct stat statbuf; + + /* Silently ignore missing directory. */ + if (stat(dirname, &statbuf) != 0 && errno == ENOENT) + return; + + /* + * Currently, walkdir doesn't offer a way for our passed in function to + * maintain state. Perhaps it should, so that we could tell the caller + * whether this operation succeeded or failed. Since this operation is + * used in a cleanup path, we wouldn't actually behave differently: we'll + * just log failures. + */ + walkdir(dirname, unlink_if_exists_fname, false, LOG); +} + /* * Open a temporary file that will disappear when we close it. * @@ -1432,24 +1536,40 @@ OpenTemporaryFile(bool interXact) DEFAULTTABLESPACE_OID, true); - /* Mark it for deletion at close */ - VfdCache[file].fdstate |= FD_TEMPORARY; + /* Mark it for deletion at close and temporary file size limit */ + VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT; /* Register it with the current resource owner */ if (!interXact) - { - VfdCache[file].fdstate |= FD_XACT_TEMPORARY; - - VfdCache[file].resowner = CurrentResourceOwner; - ResourceOwnerRememberFile(CurrentResourceOwner, file); - - /* ensure cleanup happens at eoxact */ - have_xact_temporary_files = true; - } + RegisterTemporaryFile(file); return file; } +/* + * Return the path of the temp directory in a given tablespace. + */ +void +TempTablespacePath(char *path, Oid tablespace) +{ + /* + * Identify the tempfile directory for this tablespace. + * + * If someone tries to specify pg_global, use pg_default instead. + */ + if (tablespace == InvalidOid || + tablespace == DEFAULTTABLESPACE_OID || + tablespace == GLOBALTABLESPACE_OID) + snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR); + else + { + /* All other tablespaces are accessed via symlinks */ + snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s", + tablespace, TABLESPACE_VERSION_DIRECTORY, + PG_TEMP_FILES_DIR); + } +} + /* * Open a temporary file in a specific tablespace. * Subroutine for OpenTemporaryFile, which see for details. @@ -1461,24 +1581,7 @@ OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError) char tempfilepath[MAXPGPATH]; File file; - /* - * Identify the tempfile directory for this tablespace. - * - * If someone tries to specify pg_global, use pg_default instead. - */ - if (tblspcOid == DEFAULTTABLESPACE_OID || - tblspcOid == GLOBALTABLESPACE_OID) - { - /* The default tablespace is {datadir}/base */ - snprintf(tempdirpath, sizeof(tempdirpath), "base/%s", - PG_TEMP_FILES_DIR); - } - else - { - /* All other tablespaces are accessed via symlinks */ - snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s", - tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR); - } + TempTablespacePath(tempdirpath, tblspcOid); /* * Generate a tempfile name that should be unique within the current @@ -1515,6 +1618,130 @@ OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError) return file; } + +/* + * Create a new file. The directory containing it must already exist. Files + * created this way are subject to temp_file_limit and are automatically + * closed at end of transaction, but are not automatically deleted on close + * because they are intended to be shared between cooperating backends. + * + * If the file is inside the top-level temporary directory, its name should + * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary + * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be + * inside a directory created with PathnameCreateTemporaryDir(), in which case + * the prefix isn't needed. + */ +File +PathNameCreateTemporaryFile(const char *path, bool error_on_failure) +{ + File file; + + ResourceOwnerEnlargeFiles(CurrentResourceOwner); + + /* + * Open the file. Note: we don't use O_EXCL, in case there is an orphaned + * temp file that can be reused. + */ + file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY); + if (file <= 0) + { + if (error_on_failure) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create temporary file \"%s\": %m", + path))); + else + return file; + } + + /* Mark it for temp_file_limit accounting. */ + VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT; + + /* Register it for automatic close. */ + RegisterTemporaryFile(file); + + return file; +} + +/* + * Open a file that was created with PathNameCreateTemporaryFile, possibly in + * another backend. Files opened this way don't count against the + * temp_file_limit of the caller, are read-only and are automatically closed + * at the end of the transaction but are not deleted on close. + */ +File +PathNameOpenTemporaryFile(const char *path) +{ + File file; + + ResourceOwnerEnlargeFiles(CurrentResourceOwner); + + /* We open the file read-only. */ + file = PathNameOpenFile(path, O_RDONLY | PG_BINARY); + + /* If no such file, then we don't raise an error. */ + if (file <= 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open temporary file \"%s\": %m", + path))); + + if (file > 0) + { + /* Register it for automatic close. */ + RegisterTemporaryFile(file); + } + + return file; +} + +/* + * Delete a file by pathname. Return true if the file existed, false if + * didn't. + */ +bool +PathNameDeleteTemporaryFile(const char *path, bool error_on_failure) +{ + struct stat filestats; + int stat_errno; + + /* Get the final size for pgstat reporting. */ + if (stat(path, &filestats) != 0) + stat_errno = errno; + else + stat_errno = 0; + + /* + * Unlike FileClose's automatic file deletion code, we tolerate + * non-existence to support BufFileDeleteShared which doesn't know how + * many segments it has to delete until it runs out. + */ + if (stat_errno == ENOENT) + return false; + + if (unlink(path) < 0) + { + if (errno != ENOENT) + ereport(error_on_failure ? ERROR : LOG, + (errcode_for_file_access(), + errmsg("cannot unlink temporary file \"%s\": %m", + path))); + return false; + } + + if (stat_errno == 0) + ReportTemporaryFileUsage(path, filestats.st_size); + else + { + errno = stat_errno; + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", path))); + } + + return true; +} + /* * close a file when done with it */ @@ -1543,10 +1770,17 @@ FileClose(File file) Delete(file); } + if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) + { + /* Subtract its size from current usage (do first in case of error) */ + temporary_files_size -= vfdP->fileSize; + vfdP->fileSize = 0; + } + /* * Delete the file if it was temporary, and make a log entry if wanted */ - if (vfdP->fdstate & FD_TEMPORARY) + if (vfdP->fdstate & FD_DELETE_AT_CLOSE) { struct stat filestats; int stat_errno; @@ -1558,11 +1792,8 @@ FileClose(File file) * is arranged to ensure that the worst-case consequence is failing to * emit log message(s), not failing to attempt the unlink. */ - vfdP->fdstate &= ~FD_TEMPORARY; + vfdP->fdstate &= ~FD_DELETE_AT_CLOSE; - /* Subtract its size from current usage (do first in case of error) */ - temporary_files_size -= vfdP->fileSize; - vfdP->fileSize = 0; /* first try the stat() */ if (stat(vfdP->fileName, &filestats)) @@ -1576,18 +1807,7 @@ FileClose(File file) /* and last report the stat results */ if (stat_errno == 0) - { - pgstat_report_tempfile(filestats.st_size); - - if (log_temp_files >= 0) - { - if ((filestats.st_size / 1024) >= log_temp_files) - ereport(LOG, - (errmsg("temporary file: path \"%s\", size %lu", - vfdP->fileName, - (unsigned long) filestats.st_size))); - } - } + ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size); else { errno = stat_errno; @@ -1761,7 +1981,7 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info) * message if we do that. All current callers would just throw error * immediately anyway, so this is safe at present. */ - if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY)) + if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT)) { off_t newPos; @@ -1814,7 +2034,7 @@ retry: * get here in that state if we're not enforcing temporary_files_size, * so we don't care. */ - if (vfdP->fdstate & FD_TEMPORARY) + if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) { off_t newPos = vfdP->seekPos; @@ -1985,7 +2205,7 @@ FileTruncate(File file, off_t offset, uint32 wait_event_info) if (returnCode == 0 && VfdCache[file].fileSize > offset) { /* adjust our state for truncation of a temp file */ - Assert(VfdCache[file].fdstate & FD_TEMPORARY); + Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT); temporary_files_size -= VfdCache[file].fileSize - offset; VfdCache[file].fileSize = offset; } @@ -2593,6 +2813,24 @@ TempTablespacesAreSet(void) return (numTempTableSpaces >= 0); } +/* + * GetTempTablespaces + * + * Populate an array with the OIDs of the tablespaces that should be used for + * temporary files. Return the number that were copied into the output array. + */ +int +GetTempTablespaces(Oid *tableSpaces, int numSpaces) +{ + int i; + + Assert(TempTablespacesAreSet()); + for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i) + tableSpaces[i] = tempTableSpaces[i]; + + return i; +} + /* * GetNextTempTableSpace * @@ -2696,7 +2934,8 @@ CleanupTempFiles(bool isProcExit) { unsigned short fdstate = VfdCache[i].fdstate; - if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL) + if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) && + VfdCache[i].fileName != NULL) { /* * If we're in the process of exiting a backend process, close @@ -2707,7 +2946,7 @@ CleanupTempFiles(bool isProcExit) */ if (isProcExit) FileClose(i); - else if (fdstate & FD_XACT_TEMPORARY) + else if (fdstate & FD_CLOSE_AT_EOXACT) { elog(WARNING, "temporary file %s not closed at end-of-transaction", @@ -2751,7 +2990,7 @@ RemovePgTempFiles(void) * First process temp files in pg_default ($PGDATA/base) */ snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR); - RemovePgTempFilesInDir(temp_path); + RemovePgTempFilesInDir(temp_path, false); RemovePgTempRelationFiles("base"); /* @@ -2767,7 +3006,7 @@ RemovePgTempFiles(void) snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s", spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR); - RemovePgTempFilesInDir(temp_path); + RemovePgTempFilesInDir(temp_path, false); snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s", spc_de->d_name, TABLESPACE_VERSION_DIRECTORY); @@ -2785,9 +3024,15 @@ RemovePgTempFiles(void) #endif } -/* Process one pgsql_tmp directory for RemovePgTempFiles */ +/* + * Process one pgsql_tmp directory for RemovePgTempFiles. At the top level in + * each tablespace, this should be called with unlink_all = false, so that + * only files matching the temporary name prefix will be unlinked. When + * recursing it will be called with unlink_all = true to unlink everything + * under a top-level temporary directory. + */ static void -RemovePgTempFilesInDir(const char *tmpdirname) +RemovePgTempFilesInDir(const char *tmpdirname, bool unlink_all) { DIR *temp_dir; struct dirent *temp_de; @@ -2813,10 +3058,25 @@ RemovePgTempFilesInDir(const char *tmpdirname) snprintf(rm_path, sizeof(rm_path), "%s/%s", tmpdirname, temp_de->d_name); - if (strncmp(temp_de->d_name, + if (unlink_all || + strncmp(temp_de->d_name, PG_TEMP_FILE_PREFIX, strlen(PG_TEMP_FILE_PREFIX)) == 0) - unlink(rm_path); /* note we ignore any error */ + { + struct stat statbuf; + + /* note that we ignore any error here and below */ + if (lstat(rm_path, &statbuf) < 0) + continue; + + if (S_ISDIR(statbuf.st_mode)) + { + RemovePgTempFilesInDir(rm_path, true); + rmdir(rm_path); + } + else + unlink(rm_path); + } else elog(LOG, "unexpected file found in temporary-files directory: \"%s\"", @@ -3152,6 +3412,23 @@ datadir_fsync_fname(const char *fname, bool isdir, int elevel) fsync_fname_ext(fname, isdir, true, elevel); } +static void +unlink_if_exists_fname(const char *fname, bool isdir, int elevel) +{ + if (isdir) + { + if (rmdir(fname) != 0 && errno != ENOENT) + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not rmdir directory \"%s\": %m", fname))); + } + else + { + /* Use PathNameDeleteTemporaryFile to report filesize */ + PathNameDeleteTemporaryFile(fname, false); + } +} + /* * fsync_fname_ext -- Try to fsync a file or directory * diff --git a/src/backend/storage/file/sharedfileset.c b/src/backend/storage/file/sharedfileset.c new file mode 100644 index 0000000000..343b2283f0 --- /dev/null +++ b/src/backend/storage/file/sharedfileset.c @@ -0,0 +1,244 @@ +/*------------------------------------------------------------------------- + * + * sharedfileset.c + * Shared temporary file management. + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/file/sharedfileset.c + * + * SharefFileSets provide a temporary namespace (think directory) so that + * files can be discovered by name, and a shared ownership semantics so that + * shared files survive until the last user detaches. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/hash.h" +#include "catalog/pg_tablespace.h" +#include "commands/tablespace.h" +#include "miscadmin.h" +#include "storage/dsm.h" +#include "storage/sharedfileset.h" +#include "utils/builtins.h" + +static void SharedFileSetOnDetach(dsm_segment *segment, Datum datum); +static void SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace); +static void SharedFilePath(char *path, SharedFileSet *fileset, const char *name); +static Oid ChooseTablespace(const SharedFileSet *fileset, const char *name); + +/* + * Initialize a space for temporary files that can be opened for read-only + * access by other backends. Other backends must attach to it before + * accessing it. Associate this SharedFileSet with 'seg'. Any contained + * files will be deleted when the last backend detaches. + * + * Files will be distributed over the tablespaces configured in + * temp_tablespaces. + * + * Under the covers the set is one or more directories which will eventually + * be deleted when there are no backends attached. + */ +void +SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg) +{ + static uint32 counter = 0; + + SpinLockInit(&fileset->mutex); + fileset->refcnt = 1; + fileset->creator_pid = MyProcPid; + fileset->number = counter; + counter = (counter + 1) % INT_MAX; + + /* Capture the tablespace OIDs so that all backends agree on them. */ + PrepareTempTablespaces(); + fileset->ntablespaces = + GetTempTablespaces(&fileset->tablespaces[0], + lengthof(fileset->tablespaces)); + if (fileset->ntablespaces == 0) + { + fileset->tablespaces[0] = DEFAULTTABLESPACE_OID; + fileset->ntablespaces = 1; + } + + /* Register our cleanup callback. */ + on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset)); +} + +/* + * Attach to a set of directories that was created with SharedFileSetInit. + */ +void +SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg) +{ + bool success; + + SpinLockAcquire(&fileset->mutex); + if (fileset->refcnt == 0) + success = false; + else + { + ++fileset->refcnt; + success = true; + } + SpinLockRelease(&fileset->mutex); + + if (!success) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not attach to a SharedFileSet that is already destroyed"))); + + /* Register our cleanup callback. */ + on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset)); +} + +/* + * Create a new file in the given set. + */ +File +SharedFileSetCreate(SharedFileSet *fileset, const char *name) +{ + char path[MAXPGPATH]; + File file; + + SharedFilePath(path, fileset, name); + file = PathNameCreateTemporaryFile(path, false); + + /* If we failed, see if we need to create the directory on demand. */ + if (file <= 0) + { + char tempdirpath[MAXPGPATH]; + char filesetpath[MAXPGPATH]; + Oid tablespace = ChooseTablespace(fileset, name); + + TempTablespacePath(tempdirpath, tablespace); + SharedFileSetPath(filesetpath, fileset, tablespace); + PathNameCreateTemporaryDir(tempdirpath, filesetpath); + file = PathNameCreateTemporaryFile(path, true); + } + + return file; +} + +/* + * Open a file that was created with SharedFileSetCreate(), possibly in + * another backend. + */ +File +SharedFileSetOpen(SharedFileSet *fileset, const char *name) +{ + char path[MAXPGPATH]; + File file; + + SharedFilePath(path, fileset, name); + file = PathNameOpenTemporaryFile(path); + + return file; +} + +/* + * Delete a file that was created with PathNameCreateShared(). + * Return true if the file existed, false if didn't. + */ +bool +SharedFileSetDelete(SharedFileSet *fileset, const char *name, + bool error_on_failure) +{ + char path[MAXPGPATH]; + + SharedFilePath(path, fileset, name); + + return PathNameDeleteTemporaryFile(path, error_on_failure); +} + +/* + * Delete all files in the set. + */ +void +SharedFileSetDeleteAll(SharedFileSet *fileset) +{ + char dirpath[MAXPGPATH]; + int i; + + /* + * Delete the directory we created in each tablespace. Doesn't fail + * because we use this in error cleanup paths, but can generate LOG + * message on IO error. + */ + for (i = 0; i < fileset->ntablespaces; ++i) + { + SharedFileSetPath(dirpath, fileset, fileset->tablespaces[i]); + PathNameDeleteTemporaryDir(dirpath); + } +} + +/* + * Callback function that will be invoked when this backend detaches from a + * DSM segment holding a SharedFileSet that it has created or attached to. If + * we are the last to detach, then try to remove the directories and + * everything in them. We can't raise an error on failures, because this runs + * in error cleanup paths. + */ +static void +SharedFileSetOnDetach(dsm_segment *segment, Datum datum) +{ + bool unlink_all = false; + SharedFileSet *fileset = (SharedFileSet *) DatumGetPointer(datum); + + SpinLockAcquire(&fileset->mutex); + Assert(fileset->refcnt > 0); + if (--fileset->refcnt == 0) + unlink_all = true; + SpinLockRelease(&fileset->mutex); + + /* + * If we are the last to detach, we delete the directory in all + * tablespaces. Note that we are still actually attached for the rest of + * this function so we can safely access its data. + */ + if (unlink_all) + SharedFileSetDeleteAll(fileset); +} + +/* + * Build the path for the directory holding the files backing a SharedFileSet + * in a given tablespace. + */ +static void +SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace) +{ + char tempdirpath[MAXPGPATH]; + + TempTablespacePath(tempdirpath, tablespace); + snprintf(path, MAXPGPATH, "%s/%s%d.%u.sharedfileset", + tempdirpath, PG_TEMP_FILE_PREFIX, + fileset->creator_pid, fileset->number); +} + +/* + * Sorting hat to determine which tablespace a given shared temporary file + * belongs in. + */ +static Oid +ChooseTablespace(const SharedFileSet *fileset, const char *name) +{ + uint32 hash = hash_any((const unsigned char *) name, strlen(name)); + + return fileset->tablespaces[hash % fileset->ntablespaces]; +} + +/* + * Compute the full path of a file in a SharedFileSet. + */ +static void +SharedFilePath(char *path, SharedFileSet *fileset, const char *name) +{ + char dirpath[MAXPGPATH]; + + SharedFileSetPath(dirpath, fileset, ChooseTablespace(fileset, name)); + snprintf(path, MAXPGPATH, "%s/%s", dirpath, name); +} diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h index 640908717d..c3d7a61b64 100644 --- a/src/include/storage/buffile.h +++ b/src/include/storage/buffile.h @@ -26,6 +26,8 @@ #ifndef BUFFILE_H #define BUFFILE_H +#include "storage/sharedfileset.h" + /* BufFile is an opaque type whose details are not known outside buffile.c. */ typedef struct BufFile BufFile; @@ -42,4 +44,9 @@ extern int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence); extern void BufFileTell(BufFile *file, int *fileno, off_t *offset); extern int BufFileSeekBlock(BufFile *file, long blknum); +extern BufFile *BufFileCreateShared(SharedFileSet *fileset, const char *name); +extern void BufFileExportShared(BufFile *file); +extern BufFile *BufFileOpenShared(SharedFileSet *fileset, const char *name); +extern void BufFileDeleteShared(SharedFileSet *fileset, const char *name); + #endif /* BUFFILE_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 6ea26e63b8..9829281509 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -79,6 +79,14 @@ extern int FileGetRawDesc(File file); extern int FileGetRawFlags(File file); extern mode_t FileGetRawMode(File file); +/* Operations used for sharing named temporary files */ +extern File PathNameCreateTemporaryFile(const char *name, bool error_on_failure); +extern File PathNameOpenTemporaryFile(const char *name); +extern bool PathNameDeleteTemporaryFile(const char *name, bool error_on_failure); +extern void PathNameCreateTemporaryDir(const char *base, const char *name); +extern void PathNameDeleteTemporaryDir(const char *name); +extern void TempTablespacePath(char *path, Oid tablespace); + /* Operations that allow use of regular stdio --- USE WITH CAUTION */ extern FILE *AllocateFile(const char *name, const char *mode); extern int FreeFile(FILE *file); @@ -107,6 +115,7 @@ extern void set_max_safe_fds(void); extern void closeAllVfds(void); extern void SetTempTablespaces(Oid *tableSpaces, int numSpaces); extern bool TempTablespacesAreSet(void); +extern int GetTempTablespaces(Oid *tableSpaces, int numSpaces); extern Oid GetNextTempTableSpace(void); extern void AtEOXact_Files(void); extern void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, @@ -124,7 +133,7 @@ extern int durable_unlink(const char *fname, int loglevel); extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel); extern void SyncDataDirectory(void); -/* Filename components for OpenTemporaryFile */ +/* Filename components */ #define PG_TEMP_FILES_DIR "pgsql_tmp" #define PG_TEMP_FILE_PREFIX "pgsql_tmp" diff --git a/src/include/storage/sharedfileset.h b/src/include/storage/sharedfileset.h new file mode 100644 index 0000000000..20651bb93b --- /dev/null +++ b/src/include/storage/sharedfileset.h @@ -0,0 +1,45 @@ +/*------------------------------------------------------------------------- + * + * sharedfileset.h + * Shared temporary file management. + * + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/sharedfilespace.h + * + *------------------------------------------------------------------------- + */ + +#ifndef SHAREDFILESET_H +#define SHAREDFILESET_H + +#include "storage/dsm.h" +#include "storage/fd.h" +#include "storage/spin.h" + +/* + * A set of temporary files that can be shared by multiple backends. + */ +typedef struct SharedFileSet +{ + pid_t creator_pid; /* PID of the creating process */ + uint32 number; /* per-PID identifier */ + slock_t mutex; /* mutex protecting the reference count */ + int refcnt; /* number of attached backends */ + int ntablespaces; /* number of tablespaces to use */ + Oid tablespaces[8]; /* OIDs of tablespaces to use. Assumes that + * it's rare that there more than temp + * tablespaces. */ +} SharedFileSet; + +extern void SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg); +extern void SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg); +extern File SharedFileSetCreate(SharedFileSet *fileset, const char *name); +extern File SharedFileSetOpen(SharedFileSet *fileset, const char *name); +extern bool SharedFileSetDelete(SharedFileSet *fileset, const char *name, + bool error_on_failure); +extern void SharedFileSetDeleteAll(SharedFileSet *fileset); + +#endif diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 3e84720038..72eb9fd390 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2026,6 +2026,7 @@ SharedBitmapState SharedDependencyObjectType SharedDependencyType SharedExecutorInstrumentation +SharedFileSet SharedInvalCatalogMsg SharedInvalCatcacheMsg SharedInvalRelcacheMsg