svn commit: r234991 - in user/yar: . rcguard
Yar Tikhiy
yar at FreeBSD.org
Fri May 4 01:35:14 UTC 2012
Author: yar
Date: Fri May 4 01:35:13 2012
New Revision: 234991
URL: http://svn.freebsd.org/changeset/base/234991
Log:
Publish my stab at keeping an rc.d service running
and restarting it in case it crashes.
Added:
user/yar/
user/yar/rcguard/
user/yar/rcguard/Makefile (contents, props changed)
user/yar/rcguard/rcguard.8 (contents, props changed)
user/yar/rcguard/rcguard.c (contents, props changed)
Added: user/yar/rcguard/Makefile
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/yar/rcguard/Makefile Fri May 4 01:35:13 2012 (r234991)
@@ -0,0 +1,11 @@
+PROG= rcguard
+MAN= rcguard.8
+
+DPADD= ${LIBUTIL}
+LDADD= -lutil
+
+# To be removed if accepted for the FreeBSD repo
+BINDIR?= /sbin
+WARNS?= 100
+
+.include <bsd.prog.mk>
Added: user/yar/rcguard/rcguard.8
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/yar/rcguard/rcguard.8 Fri May 4 01:35:13 2012 (r234991)
@@ -0,0 +1,172 @@
+.\" Copyright (c) 2012 Yar Tikhiy
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd May 4, 2012
+.Dt SUPERVISE 8
+.Os
+.Sh NAME
+.Nm rcguard
+.Nd "guard an rc.d service process from failure"
+.Sh SYNOPSIS
+.Nm
+.Op Fl fv
+.Fl p Ar pidfile
+.Op Fl s Ar sig_stop
+.Op Fl t Ar timeout
+.Ar service
+.Ar command
+.Sh DESCRIPTION
+The
+.Nm
+auxiliary daemon can monitor an individual
+.Pa rc.d
+service process such as a daemon and restart it, should it crash.
+The
+.Nm
+daemon is not to be started by the user:
+It is intended for the internal use by the
+.Pa rc.d
+system.
+It can be enabled on an individual service by setting its
+respective
+.Va ${name}_guard
+variable in
+.Xr rc.conf 5 ,
+which is an
+.Xr rc.subr 8
+feature.
+.Pp
+The options and arguments are as follows:
+.Bl -tag -width ".Fl s Ar sig_stop"
+.It Fl f
+Run in foreground instead of daemonizing itself.
+Useful in combination with a
+.Fl v
+option.
+.It Fl p Ar pidfile
+The PID file of the process to monitor.
+This is a mandatory option because
+.Nm
+needs to learn the PID of the watched process.
+Own PID file name of the
+.Nm
+instance is formed by appending an
+.Dq Li rcguard
+suffix to
+.Ar pidfile .
+.It Fl s Ar sig_stop
+The signal used to shut the monitored process down,
+corresponding to the
+.Va sig_stop
+variable set by the
+.Pa rc.d
+script.
+If the process terminates on this signal, it will not be restarted.
+In the absence of any
+.Fl s
+option, termination on any signal will be interpreted as crash and
+the service will be restarted.
+.It Fl t Ar timeout
+The timeout to wait for the service PID file as specified with a
+.Fl p
+option.
+Since there is no direct synchronization between
+.Nm
+and the service it monitors, the service can fully start and create
+its PID file well after
+.Nm
+was started.
+For this reason,
+.Nm
+will wait at least
+.Ar timeout
+seconds for a valid PID file to appear.
+The default timeout is 60 seconds.
+.It Fl v
+Increase verbosity level.
+Currently two levels above normal, non-verbose, mode are supported.
+Useful in combination with a
+.Fl f
+option.
+.It Ar service
+The name of the service to monitor or the absolute path name
+of its
+.Pa rc.d
+script .
+If
+.Ar service
+is specified by its name,
+.Xr service 8
+will be used to control it.
+Note well that due to a
+.Xr service 8
+limitation,
+.Ar service
+currently has to be the name of the
+.Pa rc.d
+script itself rather than the internal name set in the
+.Va name
+variable.
+At the same time, specifying
+.Ar service
+by the absolute path name of its
+.Pa rc.d
+script is unambiguous and free of that caveat.
+.It Ar command
+The
+.Pa rc.d
+command to be used to restart the service in case it fails,
+such as
+.Dq Li restart
+or
+.Dq Li onestart .
+.El
+.Pp
+By default, if the monitored process terminated on a signal,
+it is assumed to have crashed and will be restarted, but if
+it exited, it is assumed to have shut down cleanly and will
+not be restarted.
+This behavior can be adjusted with a
+.Fl s
+option to specify a single signal also meaning a clean shutdown.
+This way the monitored service can be stopped by
+.Pa rc.d
+without having to communicate with its instance of
+.Nm .
+.Sh SEE ALSO
+.Xr rc.conf 5 ,
+.Xr rc 8 ,
+.Xr rc.subr 8 ,
+.Xr service 8
+.Sh HISTORY
+The
+.Nm
+manual page first appeared in
+.Fx 9.0 .
+.Sh AUTHORS
+This
+manual page was written by
+.An Yar Tikhiy Aq yar at FreeBSD.org .
Added: user/yar/rcguard/rcguard.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/yar/rcguard/rcguard.c Fri May 4 01:35:13 2012 (r234991)
@@ -0,0 +1,395 @@
+#include <sys/param.h>
+#include <sys/event.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <err.h>
+#include <errno.h>
+#include <libutil.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <sysexits.h>
+#include <syslog.h>
+#include <unistd.h>
+
+/*
+ * Assumptions made and corners cut:
+ *
+ * XXX rc.d script name == $name set in it
+ *
+ * This is mostly true except in several historical cases.
+ * One big exception is sendmail. It effectively handles
+ * several services with different names. Ideally, those
+ * should have separate rc.d scripts.
+ * To work around exceptions, this utility accepts absolute
+ * paths to rc.d scripts as well. Now it's preferred way
+ * to invoke it from rc.subr.
+ *
+ * there is no stale pidfile left from an earlier instance
+ * of the service with a pid value now reused by an unrelated
+ * process
+ *
+ * This is an obvious race condition: Should a stale pid
+ * file exist, it will be impossible to reliably tell if
+ * it came from the current or previous instance of the
+ * service. Hence the assumption. Of course, it would
+ * be better just to remove any stale pidfiles in rc.subr
+ * before starting the service.
+ *
+ * pid value is written atomically
+ *
+ * E.g., there should be no chance to read just "12" from
+ * the pidfile if the pid value is 12345.
+ *
+ * no pidfile or other lock mechanism used here -- relying
+ * on the monitored process pidfile checked by rc.d
+ *
+ * rc.d won't try to start a service if it's already running.
+ */
+
+#define MY_NAME "rcguard"
+#define PIDFILE_SUFFIX MY_NAME
+
+#define PATH_SERVICE "/usr/sbin/service"
+
+int foreground = 0;
+struct pidfh *pfh = NULL;
+long pidfile_timeout = 60; /* seconds */
+const char *service_command;
+const char *service_name;
+const char *service_pidfile = NULL;
+int sig_stop = -1; /* no signal means clean exit by default */
+int verbose = 0;
+
+void cleanup(void);
+pid_t get_pid_from_file(const char *, long);
+void usage(void);
+int str2sig(const char *);
+int watch_pid(pid_t);
+
+int
+main(int argc, char **argv)
+{
+ char *ep;
+ char *mypidfile;
+ const char *shortname;
+ int c;
+ int restart;
+ pid_t pid;
+
+ atexit(cleanup);
+
+ while ((c = getopt(argc, argv, "fp:s:t:v")) != -1) {
+ switch (c) {
+ case 'f':
+ foreground = 1;
+ break;
+ case 'p':
+ service_pidfile = optarg;
+ if (service_pidfile[0] == '\0')
+ errx(EX_USAGE, "null pidfile name");
+ break;
+ case 's':
+ if (optarg[0] == '\0')
+ errx(EX_USAGE, "null signal name");
+ if ((sig_stop = str2sig(optarg)) == -1)
+ errx(EX_USAGE,
+ "invalid signal name %s", optarg);
+ break;
+ case 't':
+ pidfile_timeout = strtol(optarg, &ep, 10);
+ if (pidfile_timeout <= 0 || *ep != '\0')
+ errx(EX_USAGE,
+ "invalid timeout value: %s", optarg);
+ break;
+ case 'v':
+ verbose++;
+ break;
+ default:
+ usage();
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ /* Can't monitor a service w/o knowing its pidfile */
+ if (service_pidfile == NULL)
+ usage();
+
+ if (argc != 2)
+ usage();
+ service_name = argv[0];
+ service_command = argv[1];
+ if (service_name[0] == '\0')
+ errx(EX_USAGE, "null service name");
+ if (service_command[0] == '\0')
+ errx(EX_USAGE, "null service command");
+
+ /* Get basename for a nicer proctitle and messages */
+ shortname = strrchr(service_name, '/');
+ if (shortname == NULL || *(++shortname) == '\0')
+ shortname = service_name;
+
+ if (verbose) {
+ printf("Service: %s\n", service_name);
+ printf("Command: %s\n", service_command);
+ printf("Pidfile: %s\n", service_pidfile);
+ printf("Signal: %d\n", sig_stop);
+ printf("Timeout: %ld\n", pidfile_timeout);
+ }
+
+ asprintf(&mypidfile, "%s.%s", service_pidfile, PIDFILE_SUFFIX);
+ if (mypidfile == NULL)
+ errx(EX_UNAVAILABLE, "out of memory in asprintf");
+ if ((pfh = pidfile_open(mypidfile, 0644, &pid)) == NULL) {
+ if (errno == EEXIST)
+ errx(EX_UNAVAILABLE,
+ "already monitoring %s with pid %ld",
+ shortname, (long)pid);
+ else
+ err(EX_CANTCREAT, "failed to create own pidfile %s",
+ mypidfile);
+ }
+
+ /*
+ * It's OK to (re)write pidfile more than once, so do it here
+ * before a possibly long wait for the service pidfile
+ * to make our pid known for troubleshooting purposes.
+ */
+ if (pidfile_write(pfh) == -1)
+ errx(EX_UNAVAILABLE, "failed to write to own pidfile %s",
+ mypidfile);
+
+ pid = get_pid_from_file(service_pidfile, pidfile_timeout);
+
+ if (!foreground) {
+ if (verbose)
+ printf("Daemonizing; further messages in syslog\n");
+ verbose = 0; /* no stdio after daemon() */
+ if (daemon(0, 0) == -1)
+ err(EX_OSERR, "Failed to daemonize");
+ }
+
+ openlog(MY_NAME, LOG_CONS | LOG_PID, LOG_DAEMON);
+
+ /* Now that we've daemonized, rewrite our pidfile with the new pid. */
+ if (pidfile_write(pfh) == -1)
+ syslog(LOG_ERR, "failed to write to own pidfile %s",
+ mypidfile);
+
+ setproctitle("%s", shortname);
+
+ c = watch_pid(pid);
+ if (WIFSIGNALED(c)) {
+ syslog(LOG_NOTICE, "%s terminated on signal %d",
+ shortname, WTERMSIG(c));
+ restart = WTERMSIG(c) != sig_stop;
+ } else if (WIFEXITED(c)) {
+ syslog(LOG_NOTICE, "%s exited with status %d",
+ shortname, WEXITSTATUS(c));
+ restart = 0;
+ } else {
+ syslog(LOG_WARNING, "%s ceased with unknown status %d",
+ shortname, c);
+ restart = 1;
+ }
+
+ if (restart) {
+ syslog(LOG_NOTICE, "Restarting %s", shortname);
+ if (verbose)
+ printf("Restarting %s\n", service_name);
+
+ /*
+ * Although pidfile_open() sets O_CLOEXEC and so it could
+ * be OK to leave our pidfile open here, the exec'ed script
+ * will inherit our pid easily confusing programs that don't
+ * try to lock the pidfile and only check pid existence.
+ *
+ * No race condition created here as the new instance(s)
+ * will be locking the pidfile anyway.
+ */
+ pidfile_remove(pfh);
+ pfh = NULL; /* for cleanup() */
+
+ if (service_name[0] == '/') {
+ if (verbose)
+ printf("Running '%s %s'\n",
+ service_name, service_command);
+ c = execl(service_name, service_name, service_command,
+ (char *)NULL);
+ } else {
+ if (verbose)
+ printf("Running '%s %s %s'\n",
+ PATH_SERVICE, service_name,
+ service_command);
+ c = execl(PATH_SERVICE, PATH_SERVICE,
+ service_name, service_command, (char *)NULL);
+ }
+ if (c == -1)
+ syslog(LOG_ERR, "exec failed: %m");
+ else
+ syslog(LOG_ERR, "exec returned %d", c);
+ exit(EX_OSERR);
+ } else
+ syslog(LOG_NOTICE, "%s stopped", shortname);
+
+ exit(EX_OK);
+
+ return (0); /* dummy */
+}
+
+void
+cleanup(void)
+{
+
+ if (pfh) {
+ pidfile_remove(pfh);
+ pfh = NULL; /* in case there is another atexit() handler */
+ }
+}
+
+pid_t
+get_pid_from_file(const char *pidfile, long timeout)
+{
+ char buf[32];
+ char *ep;
+ FILE *fp;
+ long pid; /* will be cast to pid_t on return */
+ long slept;
+ long t;
+ struct stat st;
+
+ for (pid = slept = 0;;) {
+ if ((fp = fopen(pidfile, "r")) == NULL) {
+ if (verbose)
+ printf("Failed to open %s: %s\n",
+ pidfile, strerror(errno));
+ goto retry; /* Not created yet? */
+ }
+ if (fgets(buf, sizeof(buf), fp) == NULL) {
+ if (verbose)
+ printf("Read nothing from %s\n", pidfile);
+ fclose(fp);
+ goto retry; /* Not written yet? */
+ }
+ if (verbose > 1)
+ printf("Got 1st line from pidfile %s:\n%s\n",
+ pidfile, buf);
+ pid = strtol(buf, &ep, 10);
+ if (pid <= 0 || !(*ep == '\0' || *ep == '\n' ||
+ *ep == '\t' || *ep == ' '))
+ errx(EX_DATAERR,
+ "no pid in pidfile %s", pidfile);
+ if (verbose)
+ printf("Got pid %ld from %s\n", pid, pidfile);
+ if (fstat(fileno(fp), &st) != 0) {
+ if (verbose)
+ printf("Failed to stat %s: %s\n",
+ pidfile, strerror(errno));
+ fclose(fp);
+ goto retry; /* File system gone? */
+ }
+ fclose(fp);
+ if (kill(pid, 0) != 0) {
+ if (errno != ESRCH)
+ err(EX_NOPERM, "failed to check pid %ld", pid);
+ if (verbose)
+ printf("No process with pid %ld yet\n", pid);
+ goto retry; /* Stale pidfile? */
+ }
+ t = time(NULL) - st.st_mtime;
+ if (t >= timeout)
+ warnx("pidfile %s might be stale, age %ld seconds",
+ pidfile, t);
+ break;
+retry:
+ if (slept >= timeout)
+ errx(EX_UNAVAILABLE,
+ "timeout waiting for pidfile %s", pidfile);
+ /* Exponential backoff */
+ t = slept ? slept : 1;
+ if (verbose)
+ printf("Sleeping for %ld seconds...\n", t);
+ sleep(t);
+ slept += t;
+ if (verbose > 1)
+ printf("Slept for %ld seconds so far\n", slept);
+ if (verbose)
+ printf("Retrying...\n");
+ }
+
+ return (pid);
+}
+
+int
+str2sig(const char *s)
+{
+ char *ep;
+ int i;
+
+ /* First, check if it's numeric */
+ i = (int)strtol(s, &ep, 10);
+ if (i > 0 && *ep == '\0')
+ return (i);
+
+ /* Drop SIG prefix if present */
+ if (strncmp(s, "SIG", 3) == 0 && strlen(s) > 3)
+ s += 3;
+
+ /* Search the table of signal names */
+ for (i = 1; i < NSIG; i++) {
+ if (strcmp(s, sys_signame[i]) == 0)
+ return (i);
+ }
+
+ return (-1);
+}
+
+void
+usage(void)
+{
+ fprintf(stderr,
+ "Usage: %s [-fv] [-s sig_stop] [-t timeout] " \
+ "-p pidfile service command\n", MY_NAME);
+ exit(EX_USAGE);
+}
+
+int
+watch_pid(pid_t pid)
+{
+ int kq;
+ struct kevent kev;
+
+ if ((kq = kqueue()) == -1) {
+ syslog(LOG_ERR, "kqueue: %m");
+ exit(EX_OSERR);
+ }
+
+ EV_SET(&kev, pid, EVFILT_PROC, EV_ADD | EV_ONESHOT, NOTE_EXIT, 0, NULL);
+
+ if (verbose)
+ printf("Waiting for kevent on pid %ld...\n", (long)pid);
+
+ switch (kevent(kq, &kev, 1, &kev, 1, NULL)) {
+ case -1:
+ syslog(LOG_ERR, "kevent: %m");
+ exit(EX_OSERR);
+ case 0:
+ syslog(LOG_ERR, "kevent returned 0");
+ exit(EX_OSERR);
+ }
+
+ if ((long)kev.ident != (long)pid) {
+ syslog(LOG_ERR, "kevent fired on pid %ld not %ld",
+ (long)kev.ident, (long)pid);
+ exit(EX_OSERR);
+ }
+
+ if (verbose)
+ printf("Got exit status %d\n", (int)kev.data);
+
+ return (kev.data);
+}
More information about the svn-src-user
mailing list