svn commit: r227668 - user/sbruno/ard

Sean Bruno sbruno at FreeBSD.org
Fri Nov 18 17:37:05 UTC 2011


Author: sbruno
Date: Fri Nov 18 17:37:04 2011
New Revision: 227668
URL: http://svn.freebsd.org/changeset/base/227668

Log:
  Add the Yahoo! ata-raid monitoring daemon for review.
  
  Compiles against amd64 freebsd-current at this time.
  
  Installs a /usr/sbin/ard and can be started/stopped via the included
  rc script.
  
  Obtained from:	Yahoo! Inc. and jhb@ in a former life

Added:
  user/sbruno/ard/
  user/sbruno/ard/Makefile
  user/sbruno/ard/ard.c
  user/sbruno/ard/ard.rc   (contents, props changed)

Added: user/sbruno/ard/Makefile
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/sbruno/ard/Makefile	Fri Nov 18 17:37:04 2011	(r227668)
@@ -0,0 +1,9 @@
+# Build a single program named "ard".
+PROG=	ard
+# Directory the binary is installed into.
+BINDIR=	/usr/sbin
+
+# Keep debug symbols and enable the common warning set.
+CFLAGS+= -g -Wall -Wunused
+
+# No manual page ships with this program.  Both spellings of the knob are
+# set, presumably to cover older and newer bsd.prog.mk versions (confirm).
+NOMAN=
+NO_MAN=
+
+.include <bsd.prog.mk>

Added: user/sbruno/ard/ard.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/sbruno/ard/ard.c	Fri Nov 18 17:37:04 2011	(r227668)
@@ -0,0 +1,441 @@
+/*-
+ * Copyright (c) 2011 Yahoo! Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/ata.h>
+#include <sys/errno.h>
+#include <sys/ioctl.h>
+#include <sys/param.h>
+#if __FreeBSD_version >= 600000
+#include <sys/resource.h>
+#endif
+#include <sys/time.h>
+#include <sys/uio.h>
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+#include <unistd.h>
+#include <err.h>
+#include <fcntl.h>
+
+/* Number of RAID units (0 .. MAX_UNIT-1) that are probed and tracked. */
+#define	MAX_UNIT	16
+
+/* True for any status value other than exactly AR_READY. */
+#define VOLUME_DEGRADED(status)		((status) != AR_READY)
+
+/* Older releases name the status structure differently; alias it. */
+#if __FreeBSD_version < 600000
+#define	ata_ioc_raid_status	raid_status
+#endif
+	
+/*
+ * Per-unit bookkeeping kept between scans.
+ */
+struct ata_volume {
+	/* Status most recently fetched from the driver for this unit. */
+	struct ata_ioc_raid_status status;
+	/* Set to 1 once a scan succeeds; cleared when a status fetch fails. */
+	int	present;
+	/* Value of the global scan counter at the last successful scan. */
+	int	generation;
+	/* Status seen on the previous scan (AR_READY for a new volume). */
+	int	prev_status;
+	/* Unit number, i.e. this entry's index in the volumes array. */
+	int	unit;
+	/* Counter used to throttle repeated notifications. */
+	int	sentcnt;
+	/* Number of member slots whose disk number was negative on last scan. */
+	int	missing_drives;
+	/* Last known disk number for each member slot. */
+	int	disks[16];
+};
+
+/* Per-unit state for every RAID unit this daemon tracks. */
+static struct ata_volume volumes[MAX_UNIT];
+/* Descriptor of the ATA control device, and the running scan counter. */
+static int fd, generation;
+
+/* Local host name, included in every alert. */
+static char hostname[MAXHOSTNAMELEN];
+/*
+ * Default alert recipient.  It is written verbatim into the "To:" header,
+ * so it has to be a syntactically valid mail address.
+ */
+static char *mailto = "root@localhost";
+static int notifyminutes = 720;         /* send mail every 12 hours by default */
+/* Non-zero: print alerts on stdout instead of mailing and syslogging them. */
+static int dostdout;
+
+/*
+ * Query the driver for the RAID status of one unit.
+ *
+ * unit:   number of the array to ask about.
+ * status: receives the driver's answer when the request succeeds.
+ *
+ * Returns the ioctl() result; callers treat a negative value as failure.
+ */
+static int
+ataraid_fetch_status(int unit, struct ata_ioc_raid_status *status)
+{
+#if __FreeBSD_version >= 600000
+	status->lun = unit;
+	return (ioctl(fd, IOCATARAIDSTATUS, status));
+#else
+	struct ata_cmd request;
+	int rc;
+
+	memset(&request, 0, sizeof(request));
+	request.channel = unit;
+	request.cmd = ATARAIDSTATUS;
+	rc = ioctl(fd, IOCATA, &request);
+	if (rc < 0)
+		return (rc);
+	*status = request.u.raid_status;
+	return (rc);
+#endif
+}
+
+/*
+ * Refresh the cached state of one unit.
+ *
+ * A failed status fetch marks the unit as not present.  Otherwise the
+ * number of missing members is recounted and the unit is classified:
+ * a unit seen for the first time, or one where a slot now holds a
+ * different disk than before, restarts with prev_status = AR_READY and
+ * a fresh copy of all disk numbers.  For a known unit the previous
+ * status is remembered and only valid disk numbers are copied, so the
+ * identity of a disk that has gone missing is retained.
+ */
+static void
+ataraid_scan_volume(int unit)
+{
+	struct ata_volume *v = &volumes[unit];
+	int slot, missing, restart;
+	int old_status = v->status.status;
+
+	if (ataraid_fetch_status(unit, &v->status) < 0) {
+		v->present = 0;
+		return;
+	}
+	v->generation = generation;
+
+	missing = 0;
+	for (slot = 0; slot < v->status.total_disks; slot++) {
+		if (v->status.disks[slot].lun < 0)
+			missing++;
+	}
+	v->missing_drives = missing;
+
+	/* Decide whether this has to be handled as a brand new volume. */
+	restart = 0;
+	if (!v->present) {
+		v->present = 1;
+		restart = 1;
+	} else {
+		for (slot = 0; slot < v->status.total_disks; slot++) {
+			if (v->status.disks[slot].lun >= 0 &&
+			    v->disks[slot] >= 0 &&
+			    v->status.disks[slot].lun != v->disks[slot]) {
+				restart = 1;
+				break;
+			}
+		}
+	}
+
+	if (restart) {
+		v->prev_status = AR_READY;
+		for (slot = 0; slot < 16; slot++)
+			v->disks[slot] = v->status.disks[slot].lun;
+		return;
+	}
+
+	/* Known volume: keep the remembered number of any missing disk. */
+	v->prev_status = old_status;
+	for (slot = 0; slot < v->status.total_disks; slot++) {
+		if (v->status.disks[slot].lun >= 0)
+			v->disks[slot] = v->status.disks[slot].lun;
+	}
+}
+
+/*
+ * Start a new scan pass: bump the scan counter and refresh every unit.
+ */
+static void
+ataraid_scan_all(void)
+{
+	int unit = 0;
+
+	++generation;
+	while (unit < MAX_UNIT) {
+		ataraid_scan_volume(unit);
+		unit++;
+	}
+}
+
+/*
+ * Kick off a rebuild of a degraded volume in a child process.
+ *
+ * Nothing is started unless the volume has at least as many spare members
+ * as failed ones.  The parent returns immediately; the child issues the
+ * rebuild request and, on newer systems, replaces itself with a low
+ * priority dd that reads the whole array.
+ *
+ * vol: the volume to rebuild.
+ */
+static void
+ataraid_rebuild(struct ata_volume *vol)
+{
+#if __FreeBSD_version < 600000
+	struct ata_cmd cmd;
+#else
+	char buf[32], title[32];
+#endif
+	int i, failed, spares;
+
+	/* Make sure we have enough spares before trying a rebuild. */
+	failed = 0;
+	spares = 0;
+	for (i = 0; i < vol->status.total_disks; i++) {
+		if (vol->status.disks[i].state & AR_DISK_ONLINE)
+			continue;
+		if (vol->status.disks[i].state & AR_DISK_SPARE)
+			spares++;
+		else
+			failed++;
+	}
+	if (dostdout)
+		printf("found %d failed drives and %d spares for ar%d\n",
+		    failed, spares, vol->unit);
+	if (spares < failed)
+		return;
+
+	/*
+	 * Flush before forking so the child does not inherit buffered
+	 * output and emit it a second time.
+	 */
+	fflush(stdout);
+
+	switch (fork()) {
+	case -1:
+		/* Report the failure; the next scan pass will try again. */
+		if (dostdout)
+			warn("fork");
+		else
+			syslog(LOG_ERR, "ar%d: cannot fork for rebuild: %m",
+			    vol->unit);
+		break;
+	case 0:
+		/* Child process does the actual rebuild. */
+		setproctitle("rebuilding ar%d", vol->unit);
+		if (dostdout) {
+			printf("%d: initiating rebuild for ar%d\n", getpid(),
+			    vol->unit);
+			/* exec and _exit do not flush stdio buffers. */
+			fflush(stdout);
+		}
+#if __FreeBSD_version < 600000
+		/* Do not hand uninitialised stack contents to the driver. */
+		memset(&cmd, 0, sizeof(cmd));
+		cmd.channel = vol->unit;
+		cmd.cmd = ATARAIDREBUILD;
+		ioctl(fd, IOCATA, &cmd);
+#else
+		if (ioctl(fd, IOCATARAIDREBUILD, &vol->unit) >= 0) {
+			setpriority(PRIO_PROCESS, 0, 20);
+			snprintf(title, sizeof(title), "dd: rebuilding ar%d",
+			    vol->unit);
+			snprintf(buf, sizeof(buf), "if=/dev/ar%d", vol->unit);
+			/* The variadic terminator must be a pointer. */
+			execl("/bin/dd", title, buf, "of=/dev/null", "bs=1m",
+			    (char *)NULL);
+		}
+#endif
+		/* Leave without running the parent's exit-time handlers. */
+		_exit(0);
+	default:
+		break;
+	}
+}
+
+/*
+ * Open the ATA control device and probe every unit once.
+ *
+ * Each table entry gets its unit number assigned, whether or not the
+ * probe of that unit succeeds.
+ *
+ * Returns the number of units that answered a status request; 0 also
+ * covers the case where the device could not be opened.
+ */
+static int
+ataraid_open(void)
+{
+	int unit, found = 0;
+
+	if ((fd = open("/dev/ata", O_RDWR)) < 0)
+		return (0);
+	generation++;
+	for (unit = 0; unit < MAX_UNIT; unit++) {
+		volumes[unit].unit = unit;
+		if (ataraid_fetch_status(unit, &volumes[unit].status) >= 0)
+			found++;
+	}
+
+	return (found);
+}
+
+/*
+ * Open the stream an alert is written to and emit the "To:" header.
+ *
+ * With dostdout set the stream is stdout; otherwise it is a pipe to
+ * sendmail.
+ *
+ * Returns the stream, or NULL if sendmail could not be started.
+ * mailer_write() and mailer_close() both accept a NULL stream, so the
+ * caller does not need to check.
+ */
+static FILE *
+mailer_open(void)
+{
+	FILE *fp;
+
+	if (dostdout)
+		fp = stdout;
+	else
+		fp = popen("/usr/sbin/sendmail -t", "w");
+	if (fp == NULL) {
+		syslog(LOG_ERR, "cannot start /usr/sbin/sendmail: %m");
+		return (NULL);
+	}
+	fprintf(fp, "To: %s\n", mailto);
+	return (fp);
+}
+
+/*
+ * Finish an alert: close the sendmail pipe, or flush stdout when
+ * dostdout is set.  A NULL stream is ignored.
+ */
+static void
+mailer_close(FILE *fp)
+{
+
+	if (fp == NULL)
+		return;
+	if (dostdout == 0)
+		pclose(fp);
+	else
+		fflush(fp);
+}
+
+/*
+ * Write one printf-style formatted line of an alert.
+ *
+ * The text goes to fp (skipped when fp is NULL) and, unless dostdout is
+ * set, also to syslog at LOG_CRIT.
+ *
+ * fp:  stream from mailer_open(), possibly NULL.
+ * fmt: printf-style format followed by its arguments.
+ */
+static void
+mailer_write(FILE *fp, const char *fmt, ...)
+{
+	static const char subject[] = "Subject: ";
+	const char *logfmt;
+	char *copy, *nl;
+	va_list ap;
+
+	if (fp != NULL) {
+		va_start(ap, fmt);
+		vfprintf(fp, fmt, ap);
+		va_end(ap);
+	}
+
+	if (dostdout != 0)
+		return;
+
+	/*
+	 * XXX: Hack for Subject: log the header without its "Subject: "
+	 * prefix and without anything from the first newline on.
+	 */
+	logfmt = fmt;
+	copy = NULL;
+	if (strncmp(fmt, subject, sizeof(subject) - 1) == 0) {
+		logfmt = fmt + (sizeof(subject) - 1);
+		copy = strdup(logfmt);
+		if (copy != NULL) {
+			if ((nl = strchr(copy, '\n')) != NULL)
+				*nl = '\0';
+			logfmt = copy;
+		}
+		/* Out of memory: log the untrimmed remainder instead. */
+	}
+
+	va_start(ap, fmt);
+	vsyslog(LOG_CRIT, logfmt, ap);
+	va_end(ap);
+
+	free(copy);
+}
+
+/*
+ * Build a printable list of a volume's failed member disks, in the form
+ * "(adN,adM)".
+ *
+ * A member counts as failed when its state has neither AR_DISK_ONLINE nor
+ * AR_DISK_SPARE set and its remembered disk number is non-negative.
+ *
+ * vol: the volume to describe.
+ *
+ * Returns a NUL-terminated string obtained from malloc() that the caller
+ * must free(), or NULL when no member qualifies or on allocation or
+ * formatting failure.
+ */
+char *
+ataraid_show_failed(struct ata_volume *vol)
+{
+	char *str, *p;
+	size_t len, left;
+	int i, n, failed;
+
+	/* First pass: count the failed disks and measure their names. */
+	failed = 0;
+	len = 0;
+	for (i = 0; i < vol->status.total_disks; i++) {
+		if ((vol->status.disks[i].state & (AR_DISK_ONLINE |
+		    AR_DISK_SPARE)) != 0 || vol->disks[i] < 0)
+			continue;
+		n = snprintf(NULL, 0, ",ad%d", vol->disks[i]);
+		if (n < 0)
+			return (NULL);
+		len += (size_t)n;
+		failed++;
+	}
+	if (failed == 0)
+		return (NULL);
+
+	/* Add room for both parentheses and the terminating NUL. */
+	len += sizeof("()");
+	str = malloc(len);
+	if (str == NULL)
+		return (NULL);
+
+	/* Second pass: format the same set of disks into the buffer. */
+	p = str;
+	left = len;
+	*p++ = '(';
+	left--;
+	for (i = 0; i < vol->status.total_disks; i++) {
+		if ((vol->status.disks[i].state & (AR_DISK_ONLINE |
+		    AR_DISK_SPARE)) != 0 || vol->disks[i] < 0)
+			continue;
+		/* Every entry but the first is preceded by a comma. */
+		n = snprintf(p, left, "%sad%d", (p == str + 1) ? "" : ",",
+		    vol->disks[i]);
+		if (n < 0 || (size_t)n >= left) {
+			free(str);
+			return (NULL);
+		}
+		p += n;
+		left -= (size_t)n;
+	}
+	/*
+	 * The measurement counted a comma for the first entry too, so at
+	 * least three bytes remain here.
+	 */
+	*p++ = ')';
+	*p = '\0';
+
+	return (str);
+}
+
+/*
+ * Send an alert describing the current state of a volume.
+ *
+ * While the status is unchanged from the previous scan, alerts are
+ * throttled: one is sent only when the per-volume counter is a multiple
+ * of notifyminutes.  A status change always sends an alert.  Sending an
+ * alert resets the counter to 1.
+ *
+ * vol: the volume to report on.
+ */
+static void
+ataraid_notify_failure(struct ata_volume *vol)
+{
+	FILE *fp;
+	char *failed;
+
+	if (vol->status.status == vol->prev_status) {
+		/*
+		 * An interval of 0 would divide by zero; treat it as
+		 * "notify on every pass".
+		 */
+		if (notifyminutes != 0 &&
+		    (vol->sentcnt % notifyminutes) != 0) {
+			vol->sentcnt++;
+			return;
+		}
+	}
+	vol->sentcnt = 1;
+
+	failed = ataraid_show_failed(vol);
+
+	fp = mailer_open();
+	mailer_write(fp, "Subject: [ATA-RAID ALERT] vol ar%d on %s\n\n",
+	    vol->unit, hostname);
+	if (!VOLUME_DEGRADED(vol->status.status)) {
+		mailer_write(fp,
+		    "%s: volume ar%d is rebuilt and no longer has errors\n",
+		    hostname, vol->unit);
+	} else {
+		/* First line: rebuild progress, or loss of the volume. */
+		if (vol->status.status ==
+		    (AR_READY | AR_DEGRADED | AR_REBUILDING))
+			mailer_write(fp,
+			    "%s: rebuilding volume ar%d: %d%% completed\n",
+			    hostname, vol->unit, vol->status.progress);
+		else if (vol->status.status != (AR_READY | AR_DEGRADED))
+			mailer_write(fp, "%s: volume ar%d is lost\n", hostname,
+			    vol->unit);
+		/* Second line: the most specific detail available. */
+		if (failed)
+			mailer_write(fp,
+		    "%s: disk(s) on volume ar%d need to be replaced: %s\n",
+			    hostname, vol->unit, failed);
+		else if (vol->missing_drives)
+			mailer_write(fp,
+		    "%s: %d disk(s) on volume ar%d need to be replaced\n",
+			    hostname, vol->missing_drives, vol->unit);
+		else if (vol->status.status == (AR_READY | AR_DEGRADED))
+			mailer_write(fp, "%s: volume ar%d is degraded\n",
+			    hostname, vol->unit);
+	}
+
+	/* free(NULL) is a no-op, so no guard is needed. */
+	free(failed);
+
+	mailer_close(fp);
+}
+
+/*
+ * Act on the result of the latest scan: start a rebuild for every
+ * present volume whose status is exactly AR_READY | AR_DEGRADED, and
+ * send a notification for every present volume that is, or on the
+ * previous scan was, in any state other than AR_READY.
+ */
+static void
+ataraid_check_volumes(void)
+{
+	struct ata_volume *vol;
+
+	for (vol = volumes; vol < &volumes[MAX_UNIT]; vol++) {
+		if (!vol->present)
+			continue;
+		if (vol->status.status == (AR_READY | AR_DEGRADED))
+			ataraid_rebuild(vol);
+		if (!VOLUME_DEGRADED(vol->status.status) &&
+		    !VOLUME_DEGRADED(vol->prev_status))
+			continue;
+		ataraid_notify_failure(vol);
+	}
+}
+
+/*
+ * Print the command line synopsis on stderr and exit with status 1.
+ */
+static void
+usage(void)
+{
+	fputs("usage: ard [-ds] [-t minutes] [mailto]\n", stderr);
+	exit(1);
+}
+
+/*
+ * ard [-ds] [-t minutes] [mailto]
+ *
+ *   -d          stay in the foreground instead of daemonizing
+ *   -s          print alerts on stdout instead of mailing them
+ *   -t minutes  interval between repeated alerts for an unchanged volume
+ *   mailto      alert recipient, overriding the built-in default
+ *
+ * Returns 0 right away when no RAID volume is found; otherwise scans and
+ * checks all volumes once a minute and never returns.
+ */
+int
+main(int ac, char *av[])
+{
+	char *end;
+	long minutes;
+	int ch, daemonize = 1;
+
+	while ((ch = getopt(ac, av, "dst:")) != -1) {
+		switch (ch) {
+		case 'd':
+			daemonize = 0;
+			break;
+
+		case 't':
+			/*
+			 * Reject garbage and non-positive values: the
+			 * interval is later used as a modulus.
+			 */
+			errno = 0;
+			minutes = strtol(optarg, &end, 10);
+			if (end == optarg || *end != '\0' || errno == ERANGE ||
+			    minutes <= 0 || minutes != (long)(int)minutes)
+				errx(1, "invalid notification interval: %s",
+				    optarg);
+			notifyminutes = (int)minutes;
+			break;
+
+		case 's':
+			dostdout = 1;
+			break;
+		case '?':
+		default:
+			usage();
+		}
+	}
+
+	av += optind;
+	ac -= optind;
+
+	if (ac > 1)
+		usage();
+	if (ac == 1)
+		mailto = av[0];
+
+	/* Alerts embed the host name; never leave it unset or unterminated. */
+	if (gethostname(hostname, sizeof(hostname)) < 0)
+		snprintf(hostname, sizeof(hostname), "unknown");
+	hostname[sizeof(hostname) - 1] = '\0';
+
+	if (daemonize) {
+		if (daemon(0, 0) < 0)
+			err(1, "daemon");
+	}
+
+	if (ataraid_open() == 0)
+		return (0);
+
+	for (;;) {
+		ataraid_scan_all();
+		ataraid_check_volumes();
+		sleep(60);
+	}
+}

Added: user/sbruno/ard/ard.rc
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/sbruno/ard/ard.rc	Fri Nov 18 17:37:04 2011	(r227668)
@@ -0,0 +1,54 @@
+#!/bin/sh
+#
+# Copyright (c) 2011 Yahoo! Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+#
+
+# PROVIDE: ard
+# REQUIRE: dumpon root ldconfig devfs syslogd mail
+# KEYWORD: nojail noyroot
+
+. /etc/rc.subr
+
+name=ard
+# Alert recipient handed to the daemon; the default must be a valid mail
+# address.  An existing value is kept.
+raid_alert_mailto=${raid_alert_mailto:-"root@localhost"}
+start_cmd="ard_start"
+stop_cmd="ard_stop"
+
+# Start the daemon when its binary is installed and the ATA control device
+# exists; otherwise do nothing.
+ard_start()
+{
+	if [ -x "/usr/sbin/ard" -a -e "/dev/ata" ]; then
+		echo "Starting ard."
+		# Quoted so the recipient is always passed as one argument.
+		/usr/sbin/ard "${raid_alert_mailto}"
+	fi
+}
+
+# Stop the daemon by sending SIGKILL to every process named "ard"; all
+# output from killall is discarded.
+ard_stop()
+{
+	killall -KILL ard >/dev/null 2>&1
+}
+
+load_rc_config $name
+run_rc_command "$1"


More information about the svn-src-user mailing list