svn commit: r188025 - in projects/geom_raid5: sbin/geom/class/raid5 sys/geom/raid5 sys/modules/geom/geom_raid5

Ulf Lilleengen lulf at FreeBSD.org
Mon Feb 2 12:51:30 PST 2009


Author: lulf
Date: Mon Feb  2 20:51:26 2009
New Revision: 188025
URL: http://svn.freebsd.org/changeset/base/188025

Log:
  - Import the geom_raid5 PP sources with a few modifications such as adding
    macros for metadata offsets as well as converting kthread to kproc.

Added:
  projects/geom_raid5/sbin/geom/class/raid5/
  projects/geom_raid5/sbin/geom/class/raid5/Makefile   (contents, props changed)
  projects/geom_raid5/sbin/geom/class/raid5/geom_raid5.c   (contents, props changed)
  projects/geom_raid5/sbin/geom/class/raid5/graid5.8
  projects/geom_raid5/sys/geom/raid5/
  projects/geom_raid5/sys/geom/raid5/g_raid5.c   (contents, props changed)
  projects/geom_raid5/sys/geom/raid5/g_raid5.h   (contents, props changed)
  projects/geom_raid5/sys/modules/geom/geom_raid5/
  projects/geom_raid5/sys/modules/geom/geom_raid5/Makefile   (contents, props changed)

Added: projects/geom_raid5/sbin/geom/class/raid5/Makefile
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ projects/geom_raid5/sbin/geom/class/raid5/Makefile	Mon Feb  2 20:51:26 2009	(r188025)
@@ -0,0 +1,7 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../misc
+
+CLASS=	raid5
+
+.include <bsd.lib.mk>

Added: projects/geom_raid5/sbin/geom/class/raid5/geom_raid5.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ projects/geom_raid5/sbin/geom/class/raid5/geom_raid5.c	Mon Feb  2 20:51:26 2009	(r188025)
@@ -0,0 +1,325 @@
+/*-
+ * Copyright (c) 2006 Arne Woerner <arne_woerner at yahoo.com>
+ * testing + tuning-tricks: veronica at fluffles.net
+ * derived from gstripe/gmirror (Pawel Jakub Dawidek <pjd at FreeBSD.org>)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$Id: geom_raid5.c,v 1.33.1.12 2007/11/12 20:24:45 aw Exp aw $");
+
+#include <sys/param.h>
+#include <errno.h>
+#include <paths.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <assert.h>
+#include <libgeom.h>
+#include <geom/raid5/g_raid5.h>
+
+#include "core/geom.h"
+#include "misc/subr.h"
+
+uint32_t lib_version = G_LIB_VERSION;
+uint32_t version = G_RAID5_VERSION;
+static intmax_t default_stripesize = 64*1024;
+
+static void raid5_main(struct gctl_req *req, unsigned flags);
+static void raid5_clear(struct gctl_req *req);
+static void raid5_dump(struct gctl_req *req);
+static void raid5_label(struct gctl_req *req);
+
+#ifndef G_TYPE_BOOL
+#define G_TYPE_BOOL G_TYPE_NONE
+#endif
+
+#if __FreeBSD_version >= 700000
+#define GCMD67 NULL,
+#else
+#define GCMD67 
+#endif
+struct g_command class_commands[] = {
+	{ "clear", G_FLAG_VERBOSE, raid5_main, G_NULL_OPTS, GCMD67
+	    "[-v] prov ..."
+	},
+	{ "destroy", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		{ 'y', "noyoyo", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    }, GCMD67
+	    "[-fvy] name ..."
+	},
+	{ "remove", G_FLAG_VERBOSE, NULL, G_NULL_OPTS, GCMD67
+	    "[-v] name prov"
+	},
+	{ "insert", G_FLAG_VERBOSE, NULL,
+		{	{ 'h', "hardcode", NULL, G_TYPE_BOOL },
+			G_OPT_SENTINEL}, GCMD67
+	    "[-hv] name prov"
+	},
+	{ "configure", G_FLAG_VERBOSE, NULL,
+		{	{ 'h', "hardcode", NULL, G_TYPE_BOOL },
+			{ 'a', "activate", NULL, G_TYPE_BOOL },
+			{ 'c', "cowop", NULL, G_TYPE_BOOL },
+			{ 'n', "nohot", NULL, G_TYPE_BOOL },
+			{ 'S', "safeop", NULL, G_TYPE_BOOL },
+			{ 'R', "rebuild", NULL, G_TYPE_BOOL },
+			G_OPT_SENTINEL}, GCMD67
+	    "[-RSchnva] name"
+	},
+	{ "dump", 0, raid5_main, G_NULL_OPTS, GCMD67
+	    "prov ..."
+	},
+	{ "label", G_FLAG_VERBOSE | G_FLAG_LOADKLD, raid5_main,
+		{	{ 'c', "cowop", NULL, G_TYPE_BOOL },
+			{ 'h', "hardcode", NULL, G_TYPE_BOOL },
+			{ 'n', "nohot", NULL, G_TYPE_BOOL },
+			{ 's', "stripesize", &default_stripesize, G_TYPE_NUMBER },
+			{ 'S', "safeop", NULL, G_TYPE_BOOL },
+			G_OPT_SENTINEL}, GCMD67
+	    "[-chvn] [-s stripesize] [-S] name prov ..."
+	},
+	{ "stop", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		{ 'y', "noyoyo", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    }, GCMD67
+	    "[-fv] name ..."
+	},
+	G_CMD_SENTINEL
+};
+
+static int verbose = 0;
+
+static void
+raid5_main(struct gctl_req *req, unsigned flags)
+{
+	const char *name;
+
+	if ((flags & G_FLAG_VERBOSE) != 0)
+		verbose = 1;
+
+	name = gctl_get_ascii(req, "verb");
+	if (name == NULL) {
+		gctl_error(req, "No '%s' argument.", "verb");
+		return;
+	}
+	if (strcmp(name, "label") == 0)
+		raid5_label(req);
+	else if (strcmp(name, "clear") == 0)
+		raid5_clear(req);
+	else if (strcmp(name, "dump") == 0)
+		raid5_dump(req);
+	else
+		gctl_error(req, "Unknown command: %s.", name);
+}
+
+static void
+raid5_label(struct gctl_req *req)
+{
+	struct g_raid5_metadata md;
+	const char *name;
+	int error, i, hardcode, nargs, safeop, nohot, cowop;
+	intmax_t stripesize;
+
+	nargs = gctl_get_int(req, "nargs");
+	if (nargs < 3) {
+		gctl_error(req, "Too few arguments.");
+		return;
+	}
+	nohot = gctl_get_int(req, "nohot");
+	hardcode = gctl_get_int(req, "hardcode");
+	safeop = gctl_get_int(req, "safeop");
+	cowop = gctl_get_int(req, "cowop");
+	stripesize = gctl_get_intmax(req, "stripesize");
+	if (stripesize > 256*1024) {
+		gctl_error(req, "stripesize must be less than 512KB.");
+		return;
+	}
+	if (!powerof2(stripesize)) {
+		int cs;
+		for (cs=4096; cs < stripesize; cs<<=1);
+		gctl_error(req, "Invalid stripe size: %jd, recommended: %d.",
+		           stripesize, cs);
+		return;
+	}
+
+
+	/*
+	 * Clear last sector first to spoil all components if device exists.
+	 */
+	for (i = 1; i < nargs; i++) {
+		name = gctl_get_ascii(req, "arg%d", i);
+		error = g_metadata_clear(name, NULL);
+		if (error != 0) {
+			gctl_error(req, "Can't store metadata on %s: %s.", name,
+			    strerror(error));
+			return;
+		}
+	}
+
+	strlcpy(md.md_magic, G_RAID5_MAGIC, sizeof(md.md_magic));
+	md.md_version = G_RAID5_VERSION;
+	name = gctl_get_ascii(req, "arg0");
+	strlcpy(md.md_name, name, sizeof(md.md_name));
+	md.md_id = arc4random();
+	md.md_all = nargs - 1;
+	md.md_stripesize = stripesize;
+	md.md_verified = 0;
+	md.md_newest = -1;
+	md.md_no_hot = nohot;
+	md.md_state = nohot ? G_RAID5_STATE_CALM :
+	                      (G_RAID5_STATE_HOT|G_RAID5_STATE_VERIFY);
+	if (safeop)
+		md.md_state |= G_RAID5_STATE_SAFEOP;
+	if (cowop)
+		md.md_state |= G_RAID5_STATE_COWOP;
+
+	/*
+	 * Ok, store metadata.
+	 */
+	int64_t min = -1;
+	int64_t waste = 0;
+	for (i = 1; i < nargs; i++) {
+		u_char sector[512];
+		int64_t pmin;
+
+		name = gctl_get_ascii(req, "arg%d", i);
+		md.md_no = i - 1;
+		if (!hardcode)
+			bzero(md.md_provider, sizeof(md.md_provider));
+		else {
+			if (strncmp(name, _PATH_DEV, strlen(_PATH_DEV)) == 0)
+				name += strlen(_PATH_DEV);
+			strlcpy(md.md_provider, name, sizeof(md.md_provider));
+		}
+		md.md_provsize = g_get_mediasize(name);
+		pmin = md.md_provsize - g_get_sectorsize(name);
+		waste += pmin % stripesize;
+		if (min < 0)
+			min = pmin;
+		else if (min > pmin) {
+			waste += (i-1) * (min - pmin);
+			min = pmin;
+		} else
+			waste += pmin - min;
+		if (md.md_provsize == 0) {
+			fprintf(stderr, "Can't get mediasize of %s: %s.\n",
+			    name, strerror(errno));
+			gctl_error(req, "Not fully done.");
+			continue;
+		}
+		raid5_metadata_encode(&md, sector);
+		error = g_metadata_store(name, sector, sizeof(sector));
+		if (error != 0) {
+			fprintf(stderr, "Can't store metadata on %s: %s.\n",
+			    name, strerror(error));
+			gctl_error(req, "Not fully done.");
+			continue;
+		}
+		if (verbose)
+			printf("Metadata value stored on %s.\n", name);
+	}
+	if (waste > 0)
+		printf("Wasting %jd bytes (>=%jdGB).\n", waste, waste>>(3*10));
+}
+
+static void
+raid5_clear(struct gctl_req *req)
+{
+	const char *name;
+	int error, i, nargs;
+
+	nargs = gctl_get_int(req, "nargs");
+	if (nargs < 1) {
+		gctl_error(req, "Too few arguments.");
+		return;
+	}
+
+	for (i = 0; i < nargs; i++) {
+		name = gctl_get_ascii(req, "arg%d", i);
+		error = g_metadata_clear(name, G_RAID5_MAGIC);
+		if (error != 0) {
+			fprintf(stderr, "Can't clear metadata on %s: %s.\n",
+			    name, strerror(error));
+			gctl_error(req, "Not fully done.");
+			continue;
+		}
+		if (verbose)
+			printf("Metadata cleared on %s.\n", name);
+	}
+}
+
+static void
+raid5_metadata_dump(const struct g_raid5_metadata *md)
+{
+
+	printf("         Magic string: %s\n", md->md_magic);
+	printf("     Metadata version: %u\n", (u_int)md->md_version);
+	printf("          Device name: %s\n", md->md_name);
+	printf("            Device ID: %u\n", (u_int)md->md_id);
+	printf("          Disk number: %u\n", (u_int)md->md_no);
+	printf("Total number of disks: %u\n", (u_int)md->md_all);
+	printf("        Provider Size: %jd\n", md->md_provsize);
+	printf("             Verified: %jd\n", md->md_verified);
+	printf("                State: %u\n", (u_int)md->md_state);
+	printf("          Stripe size: %u\n", (u_int)md->md_stripesize);
+	printf("               Newest: %u\n", (u_int)md->md_newest);
+	printf("                NoHot: %s\n", md->md_no_hot?"Yes":"No");
+	printf("   Hardcoded provider: %s\n", md->md_provider);
+}
+
+static void
+raid5_dump(struct gctl_req *req)
+{
+	struct g_raid5_metadata md, tmpmd;
+	const char *name;
+	int error, i, nargs;
+
+	nargs = gctl_get_int(req, "nargs");
+	if (nargs < 1) {
+		gctl_error(req, "Too few arguments.");
+		return;
+	}
+
+	for (i = 0; i < nargs; i++) {
+		name = gctl_get_ascii(req, "arg%d", i);
+		error = g_metadata_read(name, (u_char *)&tmpmd, sizeof(tmpmd),
+		    G_RAID5_MAGIC);
+		if (error != 0) {
+			fprintf(stderr, "Can't read metadata from %s: %s.\n",
+			    name, strerror(error));
+			gctl_error(req, "Not fully done.");
+			continue;
+		}
+		raid5_metadata_decode((u_char *)&tmpmd, &md);
+		printf("Metadata on %s:\n", name);
+		raid5_metadata_dump(&md);
+		printf("\n");
+	}
+}

Added: projects/geom_raid5/sbin/geom/class/raid5/graid5.8
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ projects/geom_raid5/sbin/geom/class/raid5/graid5.8	Mon Feb  2 20:51:26 2009	(r188025)
@@ -0,0 +1,309 @@
+.\" Copyright (c) 2006 Arne Woerner <arne_woerner at yahoo.com>
+.\" testing + tuning-tricks: veronica at fluffles.net
+.\" testing: lev at FreeBSD.org
+.\" derived from gstripe/gmirror (Pawel Jakub Dawidek <pjd at FreeBSD.org>)
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $Id: graid5.8,v 1.18 2008/05/22 02:10:47 aw Exp $
+.\"
+.Dd Dec 11, 2006
+.Dt GRAID5 8
+.Os
+.Sh NAME
+.Nm graid5
+.Nd "control utility for raid5 devices"
+.Sh SYNOPSIS
+.Nm
+.Cm destroy
+.Op Fl fvy
+.Ar name ...
+.Nm
+.Cm label
+.Op Fl hnSv
+.Op Fl s Ar stripesize
+.Ar name
+.Ar prov prov ...
+.Nm
+.Cm configure
+.Op Fl hnRS
+.Ar name
+.Nm
+.Cm stop
+.Op Fl fv
+.Ar name ...
+.Nm
+.Cm insert
+.Ar name prov
+.Nm
+.Cm remove
+.Ar name prov
+.Nm
+.Cm clear
+.Op Fl v
+.Ar prov ...
+.Nm
+.Cm dump
+.Ar prov ...
+.Nm
+.Cm list
+.Nm
+.Cm status
+.Nm
+.Cm load
+.Nm
+.Cm unload
+.Sh DESCRIPTION
+The
+.Nm
+utility is used for setting up a RAID-5 on two or more disks.
+The RAID5'ed device can be configured using two different methods:
+.Dq manual
+or
+.Dq automatic .
+When using the
+.Dq manual
+method, no metadata are stored on the devices, so the RAID5
+device has to be configured by hand every time it is needed.
+The
+.Dq automatic
+method uses on-disk metadata to detect devices.
+Once devices are labeled, they will be automatically detected and
+configured.
+.Pp
+The first argument to
+.Nm
+indicates an action to be performed:
+.Bl -tag -width ".Cm destroy"
+.It Cm label
+Set up a RAID5 device from the given devices with the specified
+.Ar name .
+This is the
+.Dq automatic
+method, where metadata are stored in every device's last sector.
+The kernel module
+.Pa geom_raid5.ko
+will be loaded if it is not loaded already.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl s Ar stripesize"
+.It Fl h
+Hardcode providers' names in metadata.
+.It Fl c
+CowOp mode: Complete-Only-Write-Operation --
+dont write if not in status COMPLETE.
+.It Fl S
+SafeOp mode: read the whole stripe for every read and verify parity.
+.It Fl n
+never-hot-mode: A 2 disk graid5 device doesnt need the hot marker,
+if it is used as swap space. Furthermore this flags is useful, if
+a rebuild would be harmful even if a write request was pending.
+.It Fl s Ar stripesize
+Specify stripesize.
+Recommendation: MAXPHYS (currently 128KiB) == stripesize.
+The
+.Ar stripesize
+must be a power of 2 and
+a multiple of the largest sector size of all the providers.
+.El
+.It Cm configure
+Configure an existing graid5 device:
+.Pp
+Options are:
+.Bl -tag -width "Fl h"
+.It Fl h
+Trigger: hardcoded option.
+.It Fl a
+Reset error flag of all disks.
+.It Fl c
+CowOp mode: Complete-Only-Write-Operation --
+dont write if not in status COMPLETE.
+.It Fl n
+Trigger: never-hot-mode option.
+.It Fl S
+Trigger: SafeOp-mode option.
+.It Fl R
+Trigger: start/stop re-sync.
+.El
+.It Cm stop
+Turn off an existing RAID5 device by its
+.Ar name .
+This command does not touch on-disk metadata!
+.Pp
+Options are:
+.Bl -tag -width "Fl y"
+.It Fl f
+Force destroy even if still busy.
+.It Fl y
+Do not do the Yo-Yo effect.
+.El
+.It Cm destroy
+Same as
+.Cm stop .
+.It Cm clear
+Clear metadata on the given devices.
+.It Cm dump
+Dump metadata stored on the given devices.
+.It Cm list
+See
+.Xr geom 8 .
+.It Cm status
+See
+.Xr geom 8 .
+.It Cm load
+See
+.Xr geom 8 .
+.It Cm unload
+See
+.Xr geom 8 .
+.El
+.Pp
+Additional options:
+.Bl -tag -width ".Fl f
+.It Fl f
+Force the removal of the specified striped device.
+.It Fl v
+Be more verbose.
+.El
+.Sh SYSCTL VARIABLES
+The following
+.Xr sysctl 8
+variables can be used to control the behavior of the
+.Nm RAID5
+GEOM class.
+The default value is shown next to each variable.
+.Bl -tag -width indent
+.It Va kern.geom.raid5.debug : No 0
+Debug level of the
+.Nm RAID5
+GEOM class.
+This can be set to a number between 0 and 3 inclusive.
+If set to 0 minimal debug information is printed, and if set to 3 the
+maximum amount of debug information is printed.
+.It Va kern.geom.raid5.mhm : No 0 (read-only)
+Number of malloc hamster cache misses.
+.It Va kern.geom.raid5.mhh : No 0 (read-only)
+Number of malloc hamster cache hits.
+.It Va kern.geom.raid5.maxmem : No 8000000 (tunable)
+This variable can be set any time to any 32bit signed integer value.
+It is cropped apropriately (0..128MB) and interpreted as bytes.
+.It Va kern.geom.raid5.wqf : No 0 (read-only)
+This value shows the number of write requests that were issued early due to
+a conflicting read request.
+.It Va kern.geom.raid5.wqp : No 0 (read-only)
+This value shows the maximum number of pending write requests so far.
+.It Va kern.geom.raid5.blked1 : No 0 (read-only)
+This value shows the number of new write requests that could not be combined
+because the corresponding area already has an issued but incomplete
+write request.
+.It Va kern.geom.raid5.blked2 : No 0 (read-only)
+This value shows number of due write (2-phase) requests, that were blocked by
+another such request due to parity area conflict.
+.It Va kern.geom.raid5.dsk_ok : No 50 (read-only)
+This value shows the healthiness of the underlying devices.
+50 is perfect. 40 or lower triggers a soft-device-remove.
+0 causes an error announced to the upper layer.
+.It Va kern.geom.raid5.veri_nice : No 100 (tunable)
+This value (milli seconds) enforces a delay after a user-land read request
+for internal verify requests, which are certainly quite hindering for
+user-land requests, because they read all disks and in some cases even
+write a disk.
+.It Va kern.geom.raid5.veri_w : No 0 (read-only)
+This value shows the number of parity-failures (during rebuild)
+.It Va kern.geom.raid5.veri : No 0 (read-only)
+This value shows the number of parity checks (during rebuild).
+.It Va kern.geom.raid5.wreq2_cnt : No 0 (read-only)
+Number of 2-phase writes (1. phase: read data&parity (or "other" data in case
+of three disks); 2. phase: write data&parity).
+.It Va kern.geom.raid5.wreq1_cnt : No 0 (read-only)
+Number of 1-phase writes (sufficiently long chunks can be written in one
+phase).
+.It Va kern.geom.raid5.wreq_cnt : No 0 (read-only)
+Write requests started by upper layer.
+.It Va kern.geom.raid5.rreq_cnt : No 0 (read-only)
+Read requests started by upper layer.
+.It Va kern.geom.raid5.maxwql : No 0 (tunable)
+This variable gives a hint for the maximum length of the write queue.
+Write requests are queued until they are long enough or old enough or
+until there are too many of them.
+.It Va kern.geom.raid5.wdt : No 10 (tunable)
+This variable determines the maximum age of a write request before it
+is issued.
+.It Va kern.geom.raid5.tooc : No 3 (tunable)
+This variable determines the time-out-on-create. The provider is not
+created before all consumers are present or the timeout is over.
+.El
+.Sh EXIT STATUS
+Exit status is 0 on success, and 1 if the command fails.
+.Sh EXAMPLES
+The following example shows how to set up a RAID5 device from four disks with a
+128KB stripe size for automatic configuration,
+create a file system on it,
+and mount it:
+.Bd -literal -offset indent
+graid5 label -v -s 131072 data /dev/da0 /dev/da1 /dev/da2 /dev/da3
+newfs /dev/raid5/data
+mount /dev/raid5/data /mnt
+[...]
+umount /mnt
+graid5 stop data
+graid5 unload
+.Ed
+.Sh COMPATIBILITY
+The
+.Nm
+interleave is in number of bytes,
+unlike
+.Xr ccdconfig 8
+and
+.Xr atacontrol 8
+which use the number of sectors.
+A
+.Xr ccdconfig 8
+.Ar ileave
+of
+.Ql 128
+is 64 KB (128 512B sectors).
+The same stripe interleave would be specified as
+.Ql 65536
+for
+.Nm .
+.Sh SEE ALSO
+.Xr geom 4 ,
+.Xr loader.conf 5 ,
+.Xr atacontrol 8 ,
+.Xr ccdconfig 8 ,
+.Xr geom 8 ,
+.Xr mount 8 ,
+.Xr newfs 8 ,
+.Xr sysctl 8 ,
+.Xr umount 8 ,
+.Xr vinum 8
+.Sh HISTORY
+The
+.Nm
+utility appeared in
+.Fx 5.3 .
+.Sh AUTHORS
+.An Arne W?rner Aq arne_woerner at yahoo.com
+.An testing & tuning: Aq veronica at fluffles.net

Added: projects/geom_raid5/sys/geom/raid5/g_raid5.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ projects/geom_raid5/sys/geom/raid5/g_raid5.c	Mon Feb  2 20:51:26 2009	(r188025)
@@ -0,0 +1,4174 @@
+/*
+ * Copyright (c) 2006 Arne Woerner <arne_woerner at yahoo.com>
+ * testing + tuning-tricks: veronica at fluffles.net
+ * derived from gstripe/gmirror (Pawel Jakub Dawidek <pjd at FreeBSD.org>)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$Id: g_raid5.c,v 1.271.1.274 2008/07/29 13:58:03 aw Exp aw $");
+
+#ifdef KASSERT
+#define MYKASSERT(a,b) KASSERT(a,b)
+#else
+#define MYKASSERT(a,b) do {if (!(a)) { G_RAID5_DEBUG(0,"KASSERT in line %d.",__LINE__); panic b;}} while (0)
+#endif
+#define ORDER(a,b) do {if (a > b) { int tmp = a; a = b; b = tmp; }} while(0)
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/bio.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/kthread.h>
+#include <sys/eventhandler.h>
+#include <sys/sched.h>
+#include <geom/geom.h>
+#include <geom/raid5/g_raid5.h>
+
+/*
+ * our sysctl-s
+ */
+SYSCTL_DECL(_kern_geom);
+SYSCTL_NODE(_kern_geom, OID_AUTO, raid5, CTLFLAG_RW, 0, "GEOM_RAID5 stuff");
+static u_int g_raid5_cache_size_mem = 64*1024*1024;
+TUNABLE_INT("kern.geom.raid5.csm", &g_raid5_cache_size_mem);
+SYSCTL_INT(_kern_geom_raid5, OID_AUTO, csm, CTLFLAG_RW, &g_raid5_cache_size_mem,
+      0, "cache size ((<disk count-1)*<stripe size> per bucket) in bytes");
+static int g_raid5_cache_size = -5;
+TUNABLE_INT("kern.geom.raid5.cs", &g_raid5_cache_size);
+SYSCTL_INT(_kern_geom_raid5, OID_AUTO, cs, CTLFLAG_RW, &g_raid5_cache_size,0,
+      "cache size ((<disk count-1)*<stripe size> per bucket)");
+static u_int g_raid5_debug = 0;
+TUNABLE_INT("kern.geom.raid5.debug", &g_raid5_debug);
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, debug, CTLFLAG_RW, &g_raid5_debug, 0,
+    "Debug level");
+static u_int g_raid5_tooc = 5;
+TUNABLE_INT("kern.geom.raid5.tooc", &g_raid5_tooc);
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, tooc, CTLFLAG_RW, &g_raid5_tooc, 0,
+    "timeout on create (in order to avoid unnecessary rebuilds on reboot)");
+static u_int g_raid5_wdt = 5;
+TUNABLE_INT("kern.geom.raid5.wdt", &g_raid5_wdt);
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wdt, CTLFLAG_RW, &g_raid5_wdt, 0,
+    "write request delay (in seconds)");
+static u_int g_raid5_maxwql = 25;
+TUNABLE_INT("kern.geom.raid5.maxwql", &g_raid5_maxwql);
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, maxwql, CTLFLAG_RW, &g_raid5_maxwql, 0,
+    "max wait queue length");
+static u_int g_raid5_veri_fac = 25;
+TUNABLE_INT("kern.geom.raid5.veri_fac", &g_raid5_veri_fac);
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, veri_fac, CTLFLAG_RW, &g_raid5_veri_fac,
+    0, "veri brake factor in case of veri_min * X < veri_max");
+static u_int g_raid5_veri_nice = 100;
+TUNABLE_INT("kern.geom.raid5.veri_nice", &g_raid5_veri_nice);
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO,veri_nice, CTLFLAG_RW,&g_raid5_veri_nice,
+    0, "wait this many milli seconds after last user-read (less than 1sec)");
+static u_int g_raid5_vsc = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, veri, CTLFLAG_RD, &g_raid5_vsc, 0,
+    "verify stripe count");
+static u_int g_raid5_vwc = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, veri_w, CTLFLAG_RD, &g_raid5_vwc, 0,
+    "verify write count");
+static u_int g_raid5_rrc = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, rreq_cnt, CTLFLAG_RD, &g_raid5_rrc, 0,
+    "read request count");
+static u_int g_raid5_wrc = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wreq_cnt, CTLFLAG_RD, &g_raid5_wrc, 0,
+    "write request count");
+static u_int g_raid5_w1rc = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wreq1_cnt, CTLFLAG_RD, &g_raid5_w1rc, 0,
+    "write request count (1-phase)");
+static u_int g_raid5_w2rc = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wreq2_cnt, CTLFLAG_RD, &g_raid5_w2rc, 0,
+    "write request count (2-phase)");
+static u_int g_raid5_disks_ok = 50;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, dsk_ok, CTLFLAG_RD, &g_raid5_disks_ok,0,
+    "repeat EIO'ed request?");
+static u_int g_raid5_blked1 = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, blked1, CTLFLAG_RD, &g_raid5_blked1,0,
+    "1. kind block count");
+static u_int g_raid5_blked2 = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, blked2, CTLFLAG_RD, &g_raid5_blked2,0,
+    "2. kind block count");
+static u_int g_raid5_wqp = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wqp, CTLFLAG_RD, &g_raid5_wqp,0,
+    "max. write queue length");
+static u_int g_raid5_mhm = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, mhm, CTLFLAG_RD, &g_raid5_mhm,0,
+    "memory hamster miss");
+static u_int g_raid5_mhh = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, mhh, CTLFLAG_RD, &g_raid5_mhh,0,
+    "memory hamster hit");
+
+static MALLOC_DEFINE(M_RAID5, "raid5_data", "GEOM_RAID5 Data");
+
+static int g_raid5_destroy(struct g_raid5_softc *sc,
+                           boolean_t force, boolean_t noyoyo);
+static int g_raid5_destroy_geom(struct gctl_req *req, struct g_class *mp,
+                                struct g_geom *gp);
+
+static g_taste_t g_raid5_taste;
+static g_ctl_req_t g_raid5_config;
+static g_dumpconf_t g_raid5_dumpconf;
+
+static eventhandler_tag g_raid5_post_sync = NULL;
+
+static void g_raid5_init(struct g_class *mp);
+static void g_raid5_fini(struct g_class *mp);
+
+struct g_class g_raid5_class = {
+	.name = G_RAID5_CLASS_NAME,
+	.version = G_VERSION,
+	.ctlreq = g_raid5_config,
+	.taste = g_raid5_taste,
+	.destroy_geom = g_raid5_destroy_geom,
+	.init = g_raid5_init,
+	.fini = g_raid5_fini
+};
+
+/* GCD & LCM */
+static __inline u_int
+gcd(u_int a, u_int b)
+{
+	while (b != 0) {
+		u_int c = a;
+		a = b;
+		b = c % b;
+	}
+	return a;
+}
+static __inline u_int
+g_raid5_lcm(u_int a, u_int b)
+{ return ((a * b) / gcd(a, b)); }
+
+/*
+ * memory hamster stuff
+ * memory hamster stores in the first sizeof(int) bytes of each chunk
+ * that is requested * by malloc() the size of that chunk,
+ * while the bio-s only see that chunk at offset &[sizeof(int)]...
+ */
+static __inline int
+g_raid5_mh_sz_by_a(caddr_t m)
+{ return ((int*)m)[-1]; }
+static __inline int
+g_raid5_mh_sz_by_i(struct g_raid5_softc *sc, int i)
+{ return g_raid5_mh_sz_by_a(sc->mhl[i]); }
+static __inline void
+g_raid5_mh_sz(caddr_t m, int l)
+{ ((int*)m)[-1] = l; }
+static __inline void
+g_raid5_free_by_a(caddr_t m)
+{ free(m - sizeof(int), M_RAID5); }
+static __inline void
+g_raid5_free_by_i(struct g_raid5_softc *sc, int mi)
+{ g_raid5_free_by_a(sc->mhl[mi]); }
+static void
+g_raid5_mh_all_free(struct g_raid5_softc *sc) {
+	for (int i=0; i<sc->mhc; i++)
+		g_raid5_free_by_i(sc,i);
+	sc->mhc = 0;
+}
+static caddr_t
+g_raid5_malloc(struct g_raid5_softc *sc, int l, int force)
+{
+	mtx_lock(&sc->mh_mtx);
+	int h = l*2;
+	int fi = -1;
+	int fl = -1;
+	int i;
+	for (i=0; i<sc->mhc; i++) {
+		int ml = g_raid5_mh_sz_by_i(sc,i);
+		if (ml < l || ml > h)
+			continue;
+		if (fl > 0 && ml >= fl)
+			continue;
+		fl = ml;
+		fi = i;
+		if (ml == l)
+			break;
+	}
+	caddr_t m;
+	if (fi >= 0) {
+		m = sc->mhl[fi];
+		sc->mhc--;
+		if (fi < sc->mhc)
+			sc->mhl[fi] = sc->mhl[sc->mhc];
+		g_raid5_mhh++;
+		mtx_unlock(&sc->mh_mtx);
+	} else {
+		g_raid5_mhm++;
+		mtx_unlock(&sc->mh_mtx);
+		m = malloc(l+sizeof(fl), M_RAID5, M_NOWAIT);
+		if (m == NULL && force) {
+			g_raid5_mh_all_free(sc);
+			m = malloc(l+sizeof(fl), M_RAID5, M_WAITOK);
+		}
+		if (m != NULL) {
+			m += sizeof(fl);
+			g_raid5_mh_sz(m,l);
+		}
+	}
+	return m;
+}
+static void
+g_raid5_free(struct g_raid5_softc *sc, caddr_t m)
+{
+	mtx_lock(&sc->mh_mtx);
+	MYKASSERT(((int*)m)[-1] > 0, ("this is no mem hamster chunk."));
+	if (sc->mhc < sc->mhs) {
+		sc->mhl[sc->mhc] = m;
+		sc->mhc++;
+	} else {
+		int l = g_raid5_mh_sz_by_a(m);
+		int mi = -1;
+		int ml = -1;
+		for (int i=0; i<sc->mhc; i++) {
+			int nl = g_raid5_mh_sz_by_i(sc,i);
+			if (nl >= l)
+				continue;
+			if (ml > 0 && ml <= nl)
+				continue;
+			mi = i;
+			ml = nl;
+		}
+		if (mi < 0)
+			g_raid5_free_by_a(m);
+		else {
+			g_raid5_free_by_i(sc,mi);
+			sc->mhl[mi] = m;
+		}
+	}
+	mtx_unlock(&sc->mh_mtx);
+}
+static void
+g_raid5_mh_destroy(struct g_raid5_softc *sc)
+{
+	g_raid5_mh_all_free(sc);
+	free(sc->mhl, M_RAID5);
+	mtx_destroy(&sc->mh_mtx);
+}
+
+/*
+ * cache entry manager
+ * implements a simple queue (fst; for next bio it (ab)uses bio's bio_queue)
+ */
+static __inline int
+g_raid5_ce_em(struct g_raid5_cache_entry *ce)
+{ return ce->fst == NULL; }
+static __inline struct g_raid5_cache_entry *
+g_raid5_ce_by_i(struct g_raid5_softc *sc, int i)
+{ return sc->ce + i; }
+static struct g_raid5_cache_entry *
+g_raid5_ce_by_sno(struct g_raid5_softc *sc, off_t s)
+{
+	struct g_raid5_cache_entry *fce = NULL;
+	MYKASSERT(s >= 0, ("s must not be negative."));
+	s++;
+	int i = s % sc->cs;
+	for (int j=sc->cs; j>0; j--) {
+		struct g_raid5_cache_entry *ce = g_raid5_ce_by_i(sc,i);
+		if (ce->sno == s)
+			return ce;
+		if (fce==NULL && ce->sno == 0)
+			fce = ce;
+		i++;
+		if (i == sc->cs)
+			i = 0;
+	}
+	if (fce == NULL) {
+		sc->cfc++;
+		return NULL;
+	}
+	MYKASSERT(fce->fst == NULL, ("ce not free."));
+	MYKASSERT(fce->dc == 0, ("%p dc inconsistency %d.",fce,fce->dc));
+	MYKASSERT(fce->sno == 0, ("ce not free."));
+	fce->sno = s;
+	return fce;
+}
+static __inline struct g_raid5_cache_entry *
+g_raid5_ce_by_off(struct g_raid5_softc *sc, off_t o)
+{ return g_raid5_ce_by_sno(sc, o/sc->fsl); }
+static __inline struct g_raid5_cache_entry *
+g_raid5_ce_by_bio(struct g_raid5_softc *sc, struct bio *bp)
+{ return g_raid5_ce_by_off(sc, bp->bio_offset); }
+#define G_RAID5_C_TRAVERSE(AAA,BBB,CCC) \
+	for (int i = AAA->cs-1; i >= 0; i--) \
+		G_RAID5_CE_TRAVERSE((CCC=g_raid5_ce_by_i(sc,i)), BBB)
+#define G_RAID5_C_TRAVSAFE(AAA,BBB,CCC) \
+	for (int i = AAA->cs-1; i >= 0; i--) \
+		G_RAID5_CE_TRAVSAFE((CCC=g_raid5_ce_by_i(sc,i)), BBB)
+#define G_RAID5_CE_TRAVERSE(AAA, BBB) \
+	for (BBB = AAA->fst; BBB != NULL; BBB = g_raid5_q_nx(BBB))
+#define G_RAID5_CE_TRAVSAFE(AAA, BBB) \
+	for (BBB = AAA->fst, BBB##_nxt = g_raid5_q_nx(BBB); \
+	     BBB != NULL; \
+	     BBB = BBB##_nxt, BBB##_nxt = g_raid5_q_nx(BBB))
+static __inline void
+g_raid5_dc_inc(struct g_raid5_softc *sc, struct g_raid5_cache_entry *ce)
+{
+	MYKASSERT(ce->dc >= 0 && sc->dc >= 0 && sc->wqp >= 0, ("cannot happen."));
+	if (ce->dc == 0)
+		sc->dc++;
+	ce->dc++;
+	sc->wqp++;
+}
+static __inline void
+g_raid5_dc_dec(struct g_raid5_softc *sc, struct g_raid5_cache_entry *ce)
+{
+	MYKASSERT(ce->dc > 0 && sc->dc > 0 && sc->wqp > 0, ("cannot happen."));
+	ce->dc--;
+	if (ce->dc == 0)
+		sc->dc--;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-projects mailing list