Use a db(3) hash database instead of shared memory. Don't fork() to service

requests in the database when restarting.
This commit is contained in:
christos 1997-10-21 20:38:03 +00:00
parent 890b485672
commit 1175f55b11
3 changed files with 495 additions and 314 deletions

View File

@ -1,4 +1,4 @@
/* $NetBSD: stat_proc.c,v 1.3 1997/10/17 16:12:48 lukem Exp $ */
/* $NetBSD: stat_proc.c,v 1.4 1997/10/21 20:38:03 christos Exp $ */
/*
* Copyright (c) 1995
@ -35,7 +35,7 @@
#include <sys/cdefs.h>
#ifndef lint
__RCSID("$NetBSD: stat_proc.c,v 1.3 1997/10/17 16:12:48 lukem Exp $");
__RCSID("$NetBSD: stat_proc.c,v 1.4 1997/10/21 20:38:03 christos Exp $");
#endif
#include <errno.h>
@ -44,14 +44,13 @@ __RCSID("$NetBSD: stat_proc.c,v 1.3 1997/10/17 16:12:48 lukem Exp $");
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include <signal.h>
#include <unistd.h>
#include <rpc/rpc.h>
#include "statd.h"
static int do_unmon __P((HostInfo *, my_id *));
/* sm_stat_1 --------------------------------------------------------------- */
/*
* Purpose: RPC call to enquire if a host can be monitored
@ -65,6 +64,7 @@ sm_stat_1_svc(arg, req)
{
static sm_stat_res res;
NO_ALARM;
if (debug)
syslog(LOG_DEBUG, "stat called for host %s", arg->mon_name);
@ -76,7 +76,8 @@ sm_stat_1_svc(arg, req)
res.res_stat = stat_fail;
}
res.state = status_info->ourState;
res.state = status_info.ourState;
ALARM;
return (&res);
}
@ -94,9 +95,10 @@ sm_mon_1_svc(arg, req)
struct svc_req *req;
{
static sm_stat_res res;
HostInfo *hp;
HostInfo *hp, h;
MonList *lp;
NO_ALARM;
if (debug) {
syslog(LOG_DEBUG, "monitor request for host %s",
arg->mon_id.mon_name);
@ -105,35 +107,40 @@ sm_mon_1_svc(arg, req)
arg->mon_id.my_id.my_vers, arg->mon_id.my_id.my_proc);
}
res.res_stat = stat_fail; /* Assume fail until set otherwise */
res.state = status_info->ourState;
res.state = status_info.ourState;
/*
* Find existing host entry, or create one if not found. If
* find_host() fails, it will have logged the error already.
*/
if (!gethostbyname(arg->mon_id.mon_name))
if (!gethostbyname(arg->mon_id.mon_name)) {
syslog(LOG_ERR, "Invalid hostname to sm_mon: %s",
arg->mon_id.mon_name);
else if ((hp = find_host(arg->mon_id.mon_name, TRUE)) != NULL) {
lp = (MonList *)malloc(sizeof(MonList));
if (!lp)
syslog(LOG_ERR, "Out of memory");
else {
strncpy(lp->notifyHost, arg->mon_id.my_id.my_name,
SM_MAXSTRLEN);
lp->notifyProg = arg->mon_id.my_id.my_prog;
lp->notifyVers = arg->mon_id.my_id.my_vers;
lp->notifyProc = arg->mon_id.my_id.my_proc;
memcpy(lp->notifyData, arg->priv,
sizeof(lp->notifyData));
lp->next = hp->monList;
hp->monList = lp;
sync_file();
res.res_stat = stat_succ; /* Report success */
}
return &res;
}
if ((hp = find_host(arg->mon_id.mon_name, &h)) == NULL)
memset(hp = &h, 0, sizeof(h));
lp = (MonList *)malloc(sizeof(MonList));
if (!lp)
syslog(LOG_ERR, "Out of memory");
else {
strncpy(lp->notifyHost, arg->mon_id.my_id.my_name,
SM_MAXSTRLEN);
lp->notifyProg = arg->mon_id.my_id.my_prog;
lp->notifyVers = arg->mon_id.my_id.my_vers;
lp->notifyProc = arg->mon_id.my_id.my_proc;
memcpy(lp->notifyData, arg->priv,
sizeof(lp->notifyData));
lp->next = hp->monList;
hp->monList = lp;
change_host(arg->mon_id.mon_name, hp);
sync_file();
res.res_stat = stat_succ; /* Report success */
}
ALARM;
return (&res);
}
@ -145,11 +152,13 @@ sm_mon_1_svc(arg, req)
* In the unlikely event of more than one identical monitor
* request, all are removed.
*/
static int
do_unmon(hp, idp)
int
do_unmon(name, hp, ptr)
char *name;
HostInfo *hp;
my_id *idp;
void *ptr;
{
my_id *idp = ptr;
MonList *lp, *next;
MonList *last = NULL;
int result = FALSE;
@ -190,8 +199,9 @@ sm_unmon_1_svc(arg, req)
struct svc_req *req;
{
static sm_stat res;
HostInfo *hp;
HostInfo *hp, h;
NO_ALARM;
if (debug) {
syslog(LOG_DEBUG, "un-monitor request for host %s",
arg->mon_name);
@ -199,9 +209,11 @@ sm_unmon_1_svc(arg, req)
arg->my_id.my_name, arg->my_id.my_prog,
arg->my_id.my_vers, arg->my_id.my_proc);
}
if ((hp = find_host(arg->mon_name, FALSE)) != NULL) {
if (do_unmon(hp, &arg->my_id))
if ((hp = find_host(arg->mon_name, &h)) != NULL) {
if (do_unmon(arg->mon_name, hp, &arg->my_id)) {
change_host(arg->mon_name, hp);
sync_file();
}
else
syslog(LOG_ERR,
"unmon request from %s, no matching monitor",
@ -210,7 +222,8 @@ sm_unmon_1_svc(arg, req)
syslog(LOG_ERR, "unmon request from %s for unknown host %s",
arg->my_id.my_name, arg->mon_name);
res.state = status_info->ourState;
res.state = status_info.ourState;
ALARM;
return (&res);
}
@ -228,21 +241,19 @@ sm_unmon_all_1_svc(arg, req)
struct svc_req *req;
{
static sm_stat res;
HostInfo *hp;
int i;
NO_ALARM;
if (debug) {
syslog(LOG_DEBUG,
"unmon_all for host: %s prog: %d ver: %d proc: %d",
arg->my_name, arg->my_prog, arg->my_vers, arg->my_proc);
}
for (i = status_info->noOfHosts, hp = status_info->hosts; i; i--, hp++)
do_unmon(hp, arg);
unmon_hosts();
sync_file();
res.state = status_info->ourState;
res.state = status_info.ourState;
ALARM;
return (&res);
}
@ -266,30 +277,14 @@ sm_simu_crash_1_svc(v, req)
struct svc_req *req;
{
static char dummy;
int work_to_do;
HostInfo *hp;
int i;
work_to_do = 0;
NO_ALARM;
if (debug)
syslog(LOG_DEBUG, "simu_crash called!!");
/*
* Simulate crash by setting notify-required flag on all monitored
* hosts, and incrementing our status number. notify_hosts() is
* then called to fork a process to do the notifications.
*/
for (i = status_info->noOfHosts, hp = status_info->hosts; i > 0;
i--, hp++) {
if (hp->monList) {
work_to_do = TRUE;
hp->notifyReqd = TRUE;
}
}
status_info->ourState += 2; /* always even numbers if not crashed */
if (work_to_do)
notify_hosts();
reset_database();
ALARM;
notify_handler(0);
return (&dummy);
}
@ -319,14 +314,14 @@ sm_notify_1_svc(arg, req)
static char dummy;
status tx_arg; /* arg sent to callback procedure */
MonList *lp;
HostInfo *hp;
HostInfo *hp, h;
pid_t pid;
if (debug)
syslog(LOG_DEBUG, "notify from host %s, new state %d",
arg->mon_name, arg->state);
hp = find_host(arg->mon_name, FALSE);
hp = find_host(arg->mon_name, &h);
if (!hp) {
/* Never heard of this host - why is it notifying us? */
syslog(LOG_ERR, "Unsolicited notification from host %s",

View File

@ -1,6 +1,7 @@
/* $NetBSD: statd.c,v 1.8 1997/10/21 13:33:23 is Exp $ */
/* $NetBSD: statd.c,v 1.9 1997/10/21 20:38:11 christos Exp $ */
/*
* Copyright (c) 1997 Christos Zoulas. All rights reserved.
* Copyright (c) 1995
* A.R. Gordon (andrew.gordon@net-tel.co.uk). All rights reserved.
*
@ -15,6 +16,7 @@
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the FreeBSD project
* This product includes software developed by Christos Zoulas.
* 4. Neither the name of the author nor the names of any co-contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
@ -35,7 +37,7 @@
#include <sys/cdefs.h>
#ifndef lint
__RCSID("$NetBSD: statd.c,v 1.8 1997/10/21 13:33:23 is Exp $");
__RCSID("$NetBSD: statd.c,v 1.9 1997/10/21 20:38:11 christos Exp $");
#endif
@ -43,11 +45,8 @@ __RCSID("$NetBSD: statd.c,v 1.8 1997/10/21 13:33:23 is Exp $");
/* file was generated by running rpcgen /usr/include/rpcsvc/sm_inter.x */
/* The actual program logic is in the file procs.c */
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <err.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
@ -56,21 +55,39 @@ __RCSID("$NetBSD: statd.c,v 1.8 1997/10/21 13:33:23 is Exp $");
#include <string.h>
#include <syslog.h>
#include <unistd.h>
#include <db.h>
#include <rpc/rpc.h>
#include "statd.h"
struct sigaction sa;
int debug = 0; /* Controls syslog() for debug msgs */
int _rpcsvcdirty = 0; /* XXX ??? */
FileLayout *status_info; /* Pointer to mmap()ed status file */
static int status_fd; /* File descriptor for the open file */
static off_t status_file_len; /* Current on-disc length of file */
static DB *db; /* Database file */
int main __P((int, char **));
static void handle_sigchld __P((int));
static int notify_one_host __P((char *));
extern void sm_prog_1 __P((struct svc_req *, SVCXPRT *));
Header status_info;
static char undefdata[] = "\0\1\2\3\4\5\6\7";
static DBT undefkey = {
undefdata,
sizeof(undefdata)
};
extern char *__progname;
/* statd.c */
static int walk_one __P((int (*fun )__P ((DBT *, DBT *, void *)), DBT *, DBT *, void *));
static int walk_db __P((int (*fun )__P ((DBT *, DBT *, void *)), void *));
static int reset_host __P((DBT *, DBT *, void *));
static int check_work __P((DBT *, DBT *, void *));
static int unmon_host __P((DBT *, DBT *, void *));
static int notify_one __P((DBT *, DBT *, void *));
static void init_file __P((char *));
static int notify_one_host __P((char *));
static void die __P((int)) __attribute__((__noreturn__));
int main __P((int, char **));
int
main(argc, argv)
@ -78,7 +95,6 @@ main(argc, argv)
char **argv;
{
SVCXPRT *transp;
struct sigaction sa;
int ch;
while ((ch = getopt(argc, argv, "d")) != (-1)) {
@ -88,7 +104,7 @@ main(argc, argv)
break;
default:
case '?':
errx(1, "usage: rpc.statd [-d]");
(void) fprintf(stderr, "Usage: %s [-d]", __progname);
/* NOTREACHED */
}
}
@ -118,35 +134,40 @@ main(argc, argv)
* Note that it is NOT sensible to run this program from inetd - the
* protocol assumes that it will run immediately at boot time.
*/
daemon(0, 0);
if (!debug)
daemon(0, 0);
openlog("rpc.statd", 0, LOG_DAEMON);
if (debug)
syslog(LOG_INFO, "Starting - debug enabled");
else
syslog(LOG_INFO, "Starting");
/* Install signal handler to collect exit status of child processes */
sa.sa_handler = handle_sigchld;
sa.sa_handler = die;
sa.sa_flags = 0;
sigemptyset(&sa.sa_mask);
sigaddset(&sa.sa_mask, SIGCHLD);
(void)sigaction(SIGTERM, &sa, NULL);
(void)sigaction(SIGQUIT, &sa, NULL);
(void)sigaction(SIGHUP, &sa, NULL);
(void)sigaction(SIGINT, &sa, NULL);
sa.sa_handler = SIG_IGN;
sa.sa_flags = SA_RESTART;
sigaction(SIGCHLD, &sa, NULL);
sigemptyset(&sa.sa_mask);
sigaddset(&sa.sa_mask, SIGALRM);
/* Initialisation now complete - start operating */
/*
* notify_hosts() forks a process (if necessary) to do the
* SM_NOTIFY calls, which may be slow.
*/
notify_hosts();
/* Notify hosts that need it */
notify_handler(0);
svc_run(); /* Should never return */
exit(1);
while (1)
svc_run(); /* Should never return */
die(0);
}
/* handle_sigchld ---------------------------------------------------------- */
/* notify_handler ---------------------------------------------------------- */
/*
* Purpose: Catch SIGCHLD and collect process status
* Purpose: Catch SIGALRM and retry the pending SM_NOTIFY notifications
* Returns: Nothing.
* Notes: No special action required, other than to collect the
* process status and hence allow the child to die:
@ -154,19 +175,30 @@ main(argc, argv)
* of SM_NOTIFY to other systems, so it is normal for the
* children to exit when they have done their work.
*/
static void
handle_sigchld(sig)
void
notify_handler(sig)
int sig;
{
int pid, status;
pid = wait4(-1, &status, WNOHANG, (struct rusage *) 0);
if (!pid)
syslog(LOG_ERR, "Phantom SIGCHLD??");
else if (status)
syslog(LOG_ERR, "Child %d failed with status %d", pid,
WEXITSTATUS(status));
else if (debug)
syslog(LOG_DEBUG, "Child %d exited OK", pid);
time_t now;
NO_ALARM;
sa.sa_handler = SIG_IGN;
(void)sigaction(SIGALRM, &sa, NULL);
now = time(NULL);
(void) walk_db(notify_one, &now);
if (walk_db(check_work, &now) == 0) {
/*
* No more work to be done.
*/
CLR_ALARM;
return;
}
sync_file();
ALARM;
alarm(5);
}
/* sync_file --------------------------------------------------------------- */
@ -177,72 +209,296 @@ handle_sigchld(sig)
void
sync_file()
{
if (msync((caddr_t)status_info, 0, 0) < 0)
syslog(LOG_ERR, "msync() failed: %s", strerror(errno));
DBT data;

/* Store the in-memory header (status_info) under the reserved key. */
data.data = &status_info;
data.size = sizeof(status_info);
switch ((*db->put)(db, &undefkey, &data, 0)) {
case 0:
	/* Stored: leave the switch so that the flush below is reached. */
	break;
case -1:
	goto bad;
default:
	abort();
}
/* Flush the database; a failed store or flush is logged and is fatal. */
if ((*db->sync)(db, 0) == -1) {
bad:
	syslog(LOG_ERR, "database corrupted %m");
	die(1);
}
}
/* change_host -------------------------------------------------------------- */
/*
 * Purpose:	Store (create or overwrite) the database record for a host
 * Returns:	Nothing; a failed store is logged and the daemon exits
 * Notes:	The hostname is folded to lower case in place and is used,
 *		including its terminating NUL, as the record key.
 */
void
change_host(hostname, hp)
	char *hostname;
	HostInfo *hp;
{
	DBT k, v;
	char *cp = hostname;
	int rv;

	/* Lower-case the name where it stands; cp ends on the NUL. */
	while (*cp != '\0') {
		if (isupper((unsigned char) *cp))
			*cp = tolower((unsigned char) *cp);
		cp++;
	}

	k.data = hostname;
	k.size = cp - hostname + 1;
	v.data = hp;
	v.size = sizeof(*hp);

	rv = (*db->put)(db, &k, &v, 0);
	if (rv == 0)
		return;
	if (rv != -1)
		abort();
	syslog(LOG_ERR, "database corrupted %m");
	die(1);
}
/* find_host -------------------------------------------------------------- */
/*
* Purpose: Find the entry in the status file for a given host
* Returns: Pointer to that entry in the mmap() region, or NULL.
* Notes: Also creates entries if requested.
* Failure to create also returns NULL.
* Returns: Copy of entry in hp, or NULL if not found or on database error
* Notes:
*
*/
HostInfo *
find_host(hostname, create)
find_host(hostname, hp)
char *hostname;
int create;
{
HostInfo *hp;
HostInfo *spare_slot = NULL;
HostInfo *result = NULL;
int i;
{
DBT key, data;
char *ptr;
for (i = 0, hp = status_info->hosts; i < status_info->noOfHosts;
i++, hp++) {
if (!strncasecmp(hostname, hp->hostname, SM_MAXSTRLEN)) {
result = hp;
for (ptr = hostname; *ptr; ptr++)
if (isupper((unsigned char) *ptr))
*ptr = tolower((unsigned char) *ptr);
key.data = hostname;
key.size = ptr - hostname + 1;
switch ((*db->get)(db, &key, &data, 0)) {
case 0:
if (data.size != sizeof(*hp))
goto bad;
return memcpy(hp, data.data, sizeof(*hp));
case 1:
return NULL;
case -1:
goto bad;
default:
abort();
}
bad:
syslog(LOG_ERR, "Database corrupted %m");
return NULL;
}
/* walk_one ------------------------------------------------------------- */
/*
 * Purpose:	Apply fun to one record, skipping the reserved header record
 * Returns:	0 for the header record, otherwise whatever fun returns
 * Notes:	A host record of the wrong size is logged and is fatal.
 */
static int
walk_one(fun, key, data, ptr)
	int (*fun) __P((DBT *, DBT *, void *));
	DBT *key, *data;
	void *ptr;
{
	int is_header;

	/* The header lives under undefkey and is not a HostInfo. */
	is_header = key->size == undefkey.size &&
	    memcmp(key->data, undefkey.data, key->size) == 0;

	if (!is_header) {
		if (data->size != sizeof(HostInfo)) {
			syslog(LOG_ERR, "Bad data in database");
			die(1);
		}
		return (*fun)(key, data, ptr);
	}
	return 0;
}
/* walk_db -------------------------------------------------------------- */
/*
* Purpose: Iterate over all elements calling the given function
* Returns: -1 if function failed, 0 on success
* Notes:
*/
static int
walk_db(fun, ptr)
int (*fun) __P((DBT *, DBT *, void *));
void *ptr;
{
DBT key, data;
switch ((*db->seq)(db, &key, &data, R_FIRST)) {
case -1:
goto bad;
case 1:
/* We should have at least the magic entry at this point */
abort();
case 0:
if (walk_one(fun, &key, &data, ptr) == -1)
return -1;
break;
default:
abort();
}
for (;;)
switch ((*db->seq)(db, &key, &data, R_NEXT)) {
case -1:
goto bad;
case 1:
if (walk_one(fun, &key, &data, ptr) == -1)
return -1;
break;
case 0:
return 0;
default:
abort();
}
if (!spare_slot && !hp->monList && !hp->notifyReqd)
spare_slot = hp;
bad:
syslog(LOG_ERR, "Corrupted database %m");
die(1);
}
/* reset_host ------------------------------------------------------------ */
/*
 * Purpose:	Clean up existing hosts in file.
 * Returns:	Always success 0.
 * Notes:	Clean-up of existing file - monitored hosts will have a
 *		pointer to a list of clients, which refers to memory in
 *		the previous incarnation of the program and so are
 *		meaningless now.  These pointers are zeroed and the fact
 *		that the host was previously monitored is recorded by
 *		setting notifyReqd to the time passed in through ptr
 *		(a time_t *), which will in due course cause a SM_NOTIFY
 *		to be sent.  The attempt counter is restarted as well.
 *
 *		Note that if we crash twice in quick succession, some hosts
 *		may already have notifyReqd set, where we didn't manage to
 *		notify them before the second crash occurred.
 */
static int
reset_host(key, data, ptr)
	DBT *key, *data;
	void *ptr;
{
	HostInfo *hi = data->data;

	if (hi->monList) {
		/* The due time is the caller's argument, not the DBT itself. */
		hi->notifyReqd = *(time_t *) ptr;
		hi->attempts = 0;
		hi->monList = NULL;
		/* Write the modified record back, as unmon_host() does. */
		change_host((char *)key->data, hi);
	}
	return 0;
}
/* Return if entry found, or if not asked to create one. */
if (result || !create)
return (result);
/* check_work ------------------------------------------------------------ */
/*
* Purpose: Check if there is work to be done.
* Returns: 0 if there is no work to be done -1 if there is.
* Notes:
*/
static int
check_work(key, data, ptr)
DBT *key, *data;
void *ptr;
{
HostInfo *hi = data->data;
/*
* Now create an entry, using the spare slot if one was found or
* adding to the end of the list otherwise, extending file if req'd
*/
if (!spare_slot) {
off_t desired_size;
spare_slot = &status_info->hosts[status_info->noOfHosts];
desired_size = ((char *)spare_slot - (char *)status_info) +
sizeof(HostInfo);
return hi->notifyReqd ? -1 : 0;
}
if (desired_size > status_file_len) {
/* Extend file by writing 1 byte of junk at the
* desired end pos */
lseek(status_fd, desired_size - 1, SEEK_SET);
i = write(status_fd, &i, 1);
if (i < 1) {
syslog(LOG_ERR, "Unable to extend status file");
return (NULL);
}
status_file_len = desired_size;
/* unmon_host ------------------------------------------------------------ */
/*
 * Purpose:	walk_db() callback that applies do_unmon() to one host record
 * Returns:	0, always
 * Notes:	The record is written back with change_host() only when
 *		do_unmon() returns non-zero.
 */
static int
unmon_host(key, data, ptr)
	DBT *key, *data;
	void *ptr;
{
	HostInfo *rec = data->data;
	char *host = key->data;

	if (!do_unmon(host, rec, ptr))
		return 0;
	change_host(host, rec);
	return 0;
}
/* notify_one ------------------------------------------------------------ */
/*
* Purpose: Notify one host.
* Returns: 0 if success -1 on failure
* Notes:
*/
static int
notify_one(key, data, ptr)
DBT *key, *data;
void *ptr;
{
time_t now = *(time_t *) ptr;
char *name = key->data;
HostInfo *hi = data->data;
if (hi->notifyReqd == 0 || hi->notifyReqd > now)
return 0;
if (notify_one_host(name)) {
give_up:
hi->notifyReqd = 0;
hi->attempts = 0;
switch ((*db->put)(db, key, data, 0)) {
case -1:
syslog(LOG_ERR, "Error storing %s (%m)", name);
case 0:
return 0;
default:
abort();
}
status_info->noOfHosts++;
}
/*
* Initialise the spare slot that has been found/created
* Note that we do not msync(), since the caller is presumed to be
* about to modify the entry further
*/
memset(spare_slot, 0, sizeof(HostInfo));
strncpy(spare_slot->hostname, hostname, SM_MAXSTRLEN);
return (spare_slot);
else {
/*
* If one of the initial attempts fails, we wait
* for a while and have another go. This is necessary
* because when we have crashed, (eg. a power outage)
* it is quite possible that we won't be able to
* contact all monitored hosts immediately on restart,
* either because they crashed too and take longer
* to come up (in which case the notification isn't
* really required), or more importantly if some
* router etc. needed to reach the monitored host
* has not come back up yet. In this case, we will
* be a bit late in re-establishing locks (after the
* grace period) but that is the best we can do. We
* try 10 times at 5 sec intervals, 10 more times at
* 1 minute intervals, then 24 more times at hourly
* intervals, finally giving up altogether if the
* host hasn't come back to life after 24 hours.
*/
if (hi->attempts++ >= 44)
goto give_up;
else if (hi->attempts < 10)
hi->notifyReqd += 5;
else if (hi->attempts < 20)
hi->notifyReqd += 60;
else
hi->notifyReqd += 60 * 60;
return -1;
}
}
/* init_file -------------------------------------------------------------- */
@ -257,89 +513,73 @@ find_host(hostname, create)
* all hosts that had a monitor list, and incrementing
* the state number to the next even value.
*/
void
static void
init_file(filename)
char *filename;
{
char buf[HEADER_LEN];
int new_file = FALSE;
int i;
DBT data;
/* try to open existing file - if not present, create one */
status_fd = open(filename, O_RDWR);
if ((status_fd < 0) && (errno == ENOENT)) {
status_fd = open(filename, O_RDWR | O_CREAT, 0644);
new_file = TRUE;
}
if (status_fd < 0) {
err(1, "unable to open status file %s", filename);
/* NOTREACHED */
db = dbopen(filename, O_RDWR|O_CREAT|O_NDELAY|O_EXLOCK, 0644, DB_HASH,
    NULL);	/* mode is octal: decimal 644 sets unrelated permission bits */
if (db == NULL)
	err(1, "Cannot open `%s'", filename);
/* The header record lives under the reserved key. */
switch ((*db->get)(db, &undefkey, &data, 0)) {
case 1:
	/* New database */
	(void)memset(&status_info, 0, sizeof(status_info));
	sync_file();
	return;
case -1:
	/* err() appends strerror(errno) itself and does not expand %m. */
	err(1, "error accessing database");
case 0:
	/* Existing database */
	if (data.size != sizeof(status_info))
		errx(1, "database corrupted %lu != %lu",
		    (unsigned long)data.size,
		    (unsigned long)sizeof(status_info));
	/* Load the stored header so that ourState survives the restart. */
	(void)memcpy(&status_info, data.data, sizeof(status_info));
	break;
default:
	abort();
}
/*
* File now open. mmap() it, with a generous size to allow for
* later growth, where we will extend the file but not re-map it.
*/
status_info = (FileLayout *)mmap(NULL, 0x1000000,
PROT_READ | PROT_WRITE, MAP_SHARED, status_fd, 0);
if (status_info == (FileLayout *)(-1)) {
perror("rpc.statd");
fprintf(stderr, "Unable to mmap() status file\n");
}
status_file_len = lseek(status_fd, 0L, SEEK_END);
/*
* If the file was not newly created, validate the contents, and if
* defective, re-create from scratch.
*/
if (!new_file) {
if ((status_file_len < HEADER_LEN) || (status_file_len <
(HEADER_LEN + sizeof(HostInfo) * status_info->noOfHosts))) {
fprintf(stderr, "rpc.statd: status file is corrupt\n");
new_file = TRUE;
}
}
/* Initialisation of a new, empty file. */
if (new_file) {
memset(buf, 0, sizeof(buf));
lseek(status_fd, 0L, SEEK_SET);
write(status_fd, buf, HEADER_LEN);
status_file_len = HEADER_LEN;
} else {
/*
* Clean-up of existing file - monitored hosts will have a
* pointer to a list of clients, which refers to memory in
* the previous incarnation of the program and so are
* meaningless now. These pointers are zeroed and the fact
* that the host was previously monitored is recorded by
* setting the notifyReqd flag, which will in due course
* cause a SM_NOTIFY to be sent.
*
* Note that if we crash twice in quick succession, some hosts
* may already have notifyReqd set, where we didn't manage to
* notify them before the second crash occurred.
*/
for (i = 0; i < status_info->noOfHosts; i++) {
HostInfo *this_host = &status_info->hosts[i];
if (this_host->monList) {
this_host->notifyReqd = TRUE;
this_host->monList = NULL;
}
}
/* Select the next higher even number for the state counter */
status_info->ourState =
(status_info->ourState + 2) & 0xfffffffe;
status_info->ourState++; /* XXX - ??? */
}
reset_database();
return;
}
/* notify_one_host --------------------------------------------------------- */
/* reset_database --------------------------------------------------------- */
/*
* Purpose: Perform SM_NOTIFY procedure at specified host
* Returns: TRUE if success, FALSE if failed.
* Purpose: Resets every host record for a restart and advances our state number
* Returns: Nothing
* Notes: If this is not called on reset, it will leak memory.
*/
void
reset_database()
{
	time_t stamp;

	/* Run reset_host() over every host record, passing the current time. */
	stamp = time(NULL);
	walk_db(reset_host, &stamp);

	/*
	 * Advance the state counter: add 2, clear the low bit, then add 1.
	 * (The final increment was marked "XXX - ???" by its author.)
	 */
	status_info.ourState += 2;
	status_info.ourState &= 0xfffffffe;
	status_info.ourState++;

	/* Store the updated header. */
	sync_file();
}
/* unmon_hosts --------------------------------------------------------- */
/*
 * Purpose:	Unmonitor all the hosts
 * Returns:	Nothing
 * Notes:	Runs unmon_host() over every record, then stores the header.
 *		NOTE(review): the pointer handed to the walk is a time_t *,
 *		but unmon_host() forwards it to do_unmon(), which reads it
 *		as a my_id * -- confirm whether the caller's my_id should
 *		be passed through instead.
 */
void
unmon_hosts()
{
	time_t stamp;

	stamp = time(NULL);
	walk_db(unmon_host, &stamp);
	sync_file();
}
static int
notify_one_host(hostname)
char *hostname;
@ -353,7 +593,7 @@ notify_one_host(hostname)
gethostname(our_hostname, sizeof(our_hostname));
our_hostname[SM_MAXSTRLEN] = '\0';
arg.mon_name = our_hostname;
arg.state = status_info->ourState;
arg.state = status_info.ourState;
if (debug)
syslog(LOG_DEBUG, "Sending SM_NOTIFY to host %s from %s",
@ -376,81 +616,11 @@ notify_one_host(hostname)
return (TRUE);
}
/* notify_hosts ------------------------------------------------------------ */
/*
* Purpose: Send SM_NOTIFY to all hosts marked as requiring it
* Returns: Nothing, immediately - forks a process to do the work.
* Notes: Does nothing if there are no monitored hosts.
* Called after all the initialisation has been done -
* logs to syslog.
*/
void
notify_hosts(void)
static void
die(n)
int n;
{
HostInfo *hp;
int i, attempts;
int work_to_do = FALSE;
pid_t pid;
/* First check if there is in fact any work to do. */
for (i = status_info->noOfHosts, hp = status_info->hosts; i;
i--, hp++) {
if (hp->notifyReqd) {
work_to_do = TRUE;
break;
}
}
if (!work_to_do)
return; /* No work found */
pid = fork();
if (pid == -1) {
syslog(LOG_ERR, "Unable to fork notify process - %s",
strerror(errno));
return;
}
if (pid)
return;
/*
* Here in the child process. We continue until all the hosts marked
* as requiring notification have been duly notified.
* If one of the initial attempts fails, we sleep for a while and
* have another go. This is necessary because when we have crashed,
* (eg. a power outage) it is quite possible that we won't be able to
* contact all monitored hosts immediately on restart, either because
* they crashed too and take longer to come up (in which case the
* notification isn't really required), or more importantly if some
* router etc. needed to reach the monitored host has not come back
* up yet. In this case, we will be a bit late in re-establishing
* locks (after the grace period) but that is the best we can do.
* We try 10 times at 5 sec intervals, 10 more times at 1 minute
* intervals, then 24 more times at hourly intervals, finally
* giving up altogether if the host hasn't come back to life after
* 24 hours.
*/
for (attempts = 0; attempts < 44; attempts++) {
work_to_do = FALSE; /* Unless anything fails */
for (i = status_info->noOfHosts, hp = status_info->hosts; i > 0;
i--, hp++) {
if (hp->notifyReqd) {
if (notify_one_host(hp->hostname)) {
hp->notifyReqd = FALSE;
sync_file();
} else
work_to_do = TRUE;
}
}
if (!work_to_do)
break;
if (attempts < 10)
sleep(5);
else
if (attempts < 20)
sleep(60);
else
sleep(60 * 60);
}
exit(0);
(*db->close)(db);
exit(n);
}

View File

@ -1,4 +1,4 @@
/* $NetBSD: statd.h,v 1.1 1997/03/10 06:28:32 scottr Exp $ */
/* $NetBSD: statd.h,v 1.2 1997/10/21 20:38:19 christos Exp $ */
/*
* Copyright (c) 1995
@ -52,7 +52,7 @@
*
* We handle this by keeping the list of monitored hosts in a file
* (/var/statd.state) which is mmap()ed and whose format is described
* by the typedef FileLayout. The lists of client callbacks are chained
* by the typedef Header. The lists of client callbacks are chained
* off this structure, but are held in normal memory and so will be
* lost after a re-boot. Hence the actual values of MonList * pointers
* in the copy on disc have no significance, but their NULL/non-NULL
@ -70,9 +70,9 @@ typedef struct MonList_s {
} MonList;
typedef struct {
char hostname[SM_MAXSTRLEN + 1]; /* Name of monitored host */
int notifyReqd; /* TRUE if we've crashed and not yet
int notifyReqd; /* Time of our next attempt or 0
informed the monitored host */
int attempts; /* Number of attempts we tried so far */
MonList *monList; /* List of clients to inform if we
hear that the monitored host has
crashed, NULL if no longer monitored */
@ -82,23 +82,39 @@ typedef struct {
/* Overall file layout. */
typedef struct {
int magic; /* Zero magic */
int ourState; /* State number as defined in statd protocol */
int noOfHosts; /* Number of elements in hosts[] */
char reserved[248]; /* Reserved for future use */
HostInfo hosts[1]; /* vector of monitored hosts */
} FileLayout;
#define HEADER_LEN (sizeof(FileLayout) - sizeof(HostInfo))
} Header;
/* ------------------------------------------------------------------------- */
/* Global variables */
extern FileLayout *status_info; /* The mmap()ed status file */
extern int debug; /* = 1 to enable diagnostics to syslog */
extern struct sigaction sa;
extern Header status_info;
/* Function prototypes */
extern HostInfo *find_host __P((char *hostname, int create));
extern void init_file __P((char *filename));
extern void notify_hosts __P((void));
extern void sync_file __P((void));
/* stat_proc.c */
/* RPC service routines, one per procedure dispatched by sm_prog_1(). */
struct sm_stat_res *sm_stat_1_svc __P((sm_name *, struct svc_req *));
struct sm_stat_res *sm_mon_1_svc __P((mon *, struct svc_req *));
struct sm_stat *sm_unmon_1_svc __P((mon_id *, struct svc_req *));
struct sm_stat *sm_unmon_all_1_svc __P((my_id *, struct svc_req *));
void *sm_simu_crash_1_svc __P((void *, struct svc_req *));
void *sm_notify_1_svc __P((stat_chge *, struct svc_req *));
/*
 * Removes monitor requests from one host's list; the void * argument is
 * read as a my_id *.  Callers treat a non-zero return as "record changed"
 * (NOTE(review): body only partly visible here -- confirm).
 */
int do_unmon __P((char *, HostInfo *, void *));
/* statd.c */
/* SIGALRM handler; also called directly with 0 to start a notify pass. */
void notify_handler __P((int));
/* Stores the status_info header in the database. */
void sync_file __P((void));
/* Applies do_unmon() to every host record, then calls sync_file(). */
void unmon_hosts __P((void));
/* Stores a host record; lower-cases the hostname argument in place. */
void change_host __P((char *, HostInfo *));
/* Copies a host record into the caller's HostInfo; NULL if absent/error. */
HostInfo *find_host __P((char *, HostInfo *));
/* Resets host records for a restart and advances status_info.ourState. */
void reset_database __P((void));
/* RPC dispatch routine for the status monitor program. */
void sm_prog_1 __P((struct svc_req *, SVCXPRT *));
/*
 * SIGALRM control, built on the shared "sa" sigaction template:
 *	NO_ALARM	ignore SIGALRM
 *	ALARM		deliver SIGALRM to notify_handler()
 *	CLR_ALARM	restore the default SIGALRM action
 * Each macro does nothing (and evaluates to 0) while sa.sa_handler is
 * SIG_DFL.  The expansions are fully parenthesized so that they behave as
 * a single expression wherever they are used.
 */
#define NO_ALARM \
	(sa.sa_handler == SIG_DFL ? 0 : \
	    (sa.sa_handler = SIG_IGN, sigaction(SIGALRM, &sa, NULL)))
#define ALARM \
	(sa.sa_handler == SIG_DFL ? 0 : \
	    (sa.sa_handler = notify_handler, sigaction(SIGALRM, &sa, NULL)))
#define CLR_ALARM \
	(sa.sa_handler == SIG_DFL ? 0 : \
	    (sa.sa_handler = SIG_DFL, sigaction(SIGALRM, &sa, NULL)))