Don't use O_DIRECT when writing WAL files if archiving or streaming is
enabled. Bypassing the kernel cache is counter-productive in that case, because the archiver/walsender process will read from the WAL file soon after it's written, and if the file is not cached, that read will have to go to disk, eating I/O bandwidth available on the WAL drive. Also, the walreceiver process performs unaligned writes, which don't work with O_DIRECT, so disable O_DIRECT in the walreceiver process for that reason too.
This commit is contained in:
parent
94f610b163
commit
ad458cfe81
@ -7,7 +7,7 @@
|
|||||||
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.376 2010/02/19 01:04:03 itagaki Exp $
|
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.377 2010/02/19 10:51:03 heikki Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -2686,13 +2686,10 @@ XLogFileClose(void)
|
|||||||
* WAL segment files will not be re-read in normal operation, so we advise
|
* WAL segment files will not be re-read in normal operation, so we advise
|
||||||
* the OS to release any cached pages. But do not do so if WAL archiving
|
* the OS to release any cached pages. But do not do so if WAL archiving
|
||||||
* or streaming is active, because archiver and walsender process could use
|
* or streaming is active, because archiver and walsender process could use
|
||||||
* the cache to read the WAL segment. Also, don't bother with it if we
|
* the cache to read the WAL segment.
|
||||||
* are using O_DIRECT, since the kernel is presumably not caching in that
|
|
||||||
* case.
|
|
||||||
*/
|
*/
|
||||||
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
|
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
|
||||||
if (!XLogIsNeeded() &&
|
if (!XLogIsNeeded())
|
||||||
(get_sync_bit(sync_method) & PG_O_DIRECT) == 0)
|
|
||||||
(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
|
(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -7652,10 +7649,29 @@ xlog_outrec(StringInfo buf, XLogRecord *record)
|
|||||||
static int
|
static int
|
||||||
get_sync_bit(int method)
|
get_sync_bit(int method)
|
||||||
{
|
{
|
||||||
|
int o_direct_flag = 0;
|
||||||
|
|
||||||
/* If fsync is disabled, never open in sync mode */
|
/* If fsync is disabled, never open in sync mode */
|
||||||
if (!enableFsync)
|
if (!enableFsync)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Optimize writes by bypassing kernel cache with O_DIRECT when using
|
||||||
|
* O_SYNC, O_DSYNC or O_FSYNC. But only if archiving and streaming are
|
||||||
|
* disabled, otherwise the archive command or walsender process will
|
||||||
|
* read the WAL soon after writing it, which is guaranteed to cause a
|
||||||
|
* physical read if we bypassed the kernel cache. We also skip the
|
||||||
|
* posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the
|
||||||
|
* same reason.
|
||||||
|
*
|
||||||
|
* Never use O_DIRECT in walreceiver process for similar reasons; the WAL
|
||||||
|
* written by walreceiver is normally read by the startup process soon
|
||||||
|
* after it's written. Also, walreceiver performs unaligned writes, which
|
||||||
|
* don't work with O_DIRECT, so it is required for correctness too.
|
||||||
|
*/
|
||||||
|
if (!XLogIsNeeded() && !am_walreceiver)
|
||||||
|
o_direct_flag = PG_O_DIRECT;
|
||||||
|
|
||||||
switch (method)
|
switch (method)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
@ -7670,11 +7686,11 @@ get_sync_bit(int method)
|
|||||||
return 0;
|
return 0;
|
||||||
#ifdef OPEN_SYNC_FLAG
|
#ifdef OPEN_SYNC_FLAG
|
||||||
case SYNC_METHOD_OPEN:
|
case SYNC_METHOD_OPEN:
|
||||||
return OPEN_SYNC_FLAG;
|
return OPEN_SYNC_FLAG | o_direct_flag;
|
||||||
#endif
|
#endif
|
||||||
#ifdef OPEN_DATASYNC_FLAG
|
#ifdef OPEN_DATASYNC_FLAG
|
||||||
case SYNC_METHOD_OPEN_DSYNC:
|
case SYNC_METHOD_OPEN_DSYNC:
|
||||||
return OPEN_DATASYNC_FLAG;
|
return OPEN_DATASYNC_FLAG | o_direct_flag;
|
||||||
#endif
|
#endif
|
||||||
default:
|
default:
|
||||||
/* can't happen (unless we are out of sync with option array) */
|
/* can't happen (unless we are out of sync with option array) */
|
||||||
|
@ -29,7 +29,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/replication/walreceiver.c,v 1.4 2010/02/17 04:19:39 tgl Exp $
|
* $PostgreSQL: pgsql/src/backend/replication/walreceiver.c,v 1.5 2010/02/19 10:51:04 heikki Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -50,6 +50,9 @@
|
|||||||
#include "utils/ps_status.h"
|
#include "utils/ps_status.h"
|
||||||
#include "utils/resowner.h"
|
#include "utils/resowner.h"
|
||||||
|
|
||||||
|
/* Global variable to indicate if this process is a walreceiver process */
|
||||||
|
bool am_walreceiver;
|
||||||
|
|
||||||
/* libpqreceiver hooks to these when loaded */
|
/* libpqreceiver hooks to these when loaded */
|
||||||
walrcv_connect_type walrcv_connect = NULL;
|
walrcv_connect_type walrcv_connect = NULL;
|
||||||
walrcv_receive_type walrcv_receive = NULL;
|
walrcv_receive_type walrcv_receive = NULL;
|
||||||
@ -158,6 +161,8 @@ WalReceiverMain(void)
|
|||||||
/* use volatile pointer to prevent code rearrangement */
|
/* use volatile pointer to prevent code rearrangement */
|
||||||
volatile WalRcvData *walrcv = WalRcv;
|
volatile WalRcvData *walrcv = WalRcv;
|
||||||
|
|
||||||
|
am_walreceiver = true;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* WalRcv should be set up already (if we are a backend, we inherit
|
* WalRcv should be set up already (if we are a backend, we inherit
|
||||||
* this by fork() or EXEC_BACKEND mechanism from the postmaster).
|
* this by fork() or EXEC_BACKEND mechanism from the postmaster).
|
||||||
@ -424,16 +429,18 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr)
|
|||||||
bool use_existent;
|
bool use_existent;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* XLOG segment files will be re-read in recovery operation soon,
|
* fsync() and close current file before we switch to next one.
|
||||||
* so we don't need to advise the OS to release any cache page.
|
* We would otherwise have to reopen this file to fsync it later
|
||||||
*/
|
*/
|
||||||
if (recvFile >= 0)
|
if (recvFile >= 0)
|
||||||
{
|
{
|
||||||
/*
|
|
||||||
* fsync() before we switch to next file. We would otherwise
|
|
||||||
* have to reopen this file to fsync it later
|
|
||||||
*/
|
|
||||||
XLogWalRcvFlush();
|
XLogWalRcvFlush();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* XLOG segment files will be re-read by recovery in startup
|
||||||
|
* process soon, so we don't advise the OS to release cache
|
||||||
|
* pages associated with the file like XLogFileClose() does.
|
||||||
|
*/
|
||||||
if (close(recvFile) != 0)
|
if (close(recvFile) != 0)
|
||||||
ereport(PANIC,
|
ereport(PANIC,
|
||||||
(errcode_for_file_access(),
|
(errcode_for_file_access(),
|
||||||
@ -445,8 +452,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr)
|
|||||||
/* Create/use new log file */
|
/* Create/use new log file */
|
||||||
XLByteToSeg(recptr, recvId, recvSeg);
|
XLByteToSeg(recptr, recvId, recvSeg);
|
||||||
use_existent = true;
|
use_existent = true;
|
||||||
recvFile = XLogFileInit(recvId, recvSeg,
|
recvFile = XLogFileInit(recvId, recvSeg, &use_existent, true);
|
||||||
&use_existent, true);
|
|
||||||
recvOff = 0;
|
recvOff = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* $PostgreSQL: pgsql/src/include/access/xlogdefs.h,v 1.25 2010/01/15 09:19:06 heikki Exp $
|
* $PostgreSQL: pgsql/src/include/access/xlogdefs.h,v 1.26 2010/02/19 10:51:04 heikki Exp $
|
||||||
*/
|
*/
|
||||||
#ifndef XLOG_DEFS_H
|
#ifndef XLOG_DEFS_H
|
||||||
#define XLOG_DEFS_H
|
#define XLOG_DEFS_H
|
||||||
@ -106,23 +106,20 @@ typedef uint32 TimeLineID;
|
|||||||
* configure determined whether fdatasync() is.
|
* configure determined whether fdatasync() is.
|
||||||
*/
|
*/
|
||||||
#if defined(O_SYNC)
|
#if defined(O_SYNC)
|
||||||
#define BARE_OPEN_SYNC_FLAG O_SYNC
|
#define OPEN_SYNC_FLAG O_SYNC
|
||||||
#elif defined(O_FSYNC)
|
#elif defined(O_FSYNC)
|
||||||
#define BARE_OPEN_SYNC_FLAG O_FSYNC
|
#define OPEN_SYNC_FLAG O_FSYNC
|
||||||
#endif
|
|
||||||
#ifdef BARE_OPEN_SYNC_FLAG
|
|
||||||
#define OPEN_SYNC_FLAG (BARE_OPEN_SYNC_FLAG | PG_O_DIRECT)
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(O_DSYNC)
|
#if defined(O_DSYNC)
|
||||||
#if defined(OPEN_SYNC_FLAG)
|
#if defined(OPEN_SYNC_FLAG)
|
||||||
/* O_DSYNC is distinct? */
|
/* O_DSYNC is distinct? */
|
||||||
#if O_DSYNC != BARE_OPEN_SYNC_FLAG
|
#if O_DSYNC != OPEN_SYNC_FLAG
|
||||||
#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
|
#define OPEN_DATASYNC_FLAG O_DSYNC
|
||||||
#endif
|
#endif
|
||||||
#else /* !defined(OPEN_SYNC_FLAG) */
|
#else /* !defined(OPEN_SYNC_FLAG) */
|
||||||
/* Win32 only has O_DSYNC */
|
/* Win32 only has O_DSYNC */
|
||||||
#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
|
#define OPEN_DATASYNC_FLAG O_DSYNC
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
*
|
*
|
||||||
* Portions Copyright (c) 2010-2010, PostgreSQL Global Development Group
|
* Portions Copyright (c) 2010-2010, PostgreSQL Global Development Group
|
||||||
*
|
*
|
||||||
* $PostgreSQL: pgsql/src/include/replication/walreceiver.h,v 1.6 2010/02/03 09:47:19 heikki Exp $
|
* $PostgreSQL: pgsql/src/include/replication/walreceiver.h,v 1.7 2010/02/19 10:51:04 heikki Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -15,6 +15,8 @@
|
|||||||
#include "access/xlogdefs.h"
|
#include "access/xlogdefs.h"
|
||||||
#include "storage/spin.h"
|
#include "storage/spin.h"
|
||||||
|
|
||||||
|
extern bool am_walreceiver;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* MAXCONNINFO: maximum size of a connection string.
|
* MAXCONNINFO: maximum size of a connection string.
|
||||||
*
|
*
|
||||||
|
Loading…
x
Reference in New Issue
Block a user