svn commit: r188025 - in projects/geom_raid5: sbin/geom/class/raid5
sys/geom/raid5 sys/modules/geom/geom_raid5
Ulf Lilleengen
lulf at FreeBSD.org
Mon Feb 2 12:51:30 PST 2009
Author: lulf
Date: Mon Feb 2 20:51:26 2009
New Revision: 188025
URL: http://svn.freebsd.org/changeset/base/188025
Log:
- Import the geom_raid5 PP sources with a few modifications such as adding
macros for metadata offsets as well as converting kthread to kproc.
Added:
projects/geom_raid5/sbin/geom/class/raid5/
projects/geom_raid5/sbin/geom/class/raid5/Makefile (contents, props changed)
projects/geom_raid5/sbin/geom/class/raid5/geom_raid5.c (contents, props changed)
projects/geom_raid5/sbin/geom/class/raid5/graid5.8
projects/geom_raid5/sys/geom/raid5/
projects/geom_raid5/sys/geom/raid5/g_raid5.c (contents, props changed)
projects/geom_raid5/sys/geom/raid5/g_raid5.h (contents, props changed)
projects/geom_raid5/sys/modules/geom/geom_raid5/
projects/geom_raid5/sys/modules/geom/geom_raid5/Makefile (contents, props changed)
Added: projects/geom_raid5/sbin/geom/class/raid5/Makefile
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ projects/geom_raid5/sbin/geom/class/raid5/Makefile Mon Feb 2 20:51:26 2009 (r188025)
@@ -0,0 +1,7 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../misc
+
+CLASS= raid5
+
+.include <bsd.lib.mk>
Added: projects/geom_raid5/sbin/geom/class/raid5/geom_raid5.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ projects/geom_raid5/sbin/geom/class/raid5/geom_raid5.c Mon Feb 2 20:51:26 2009 (r188025)
@@ -0,0 +1,325 @@
+/*-
+ * Copyright (c) 2006 Arne Woerner <arne_woerner at yahoo.com>
+ * testing + tuning-tricks: veronica at fluffles.net
+ * derived from gstripe/gmirror (Pawel Jakub Dawidek <pjd at FreeBSD.org>)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$Id: geom_raid5.c,v 1.33.1.12 2007/11/12 20:24:45 aw Exp aw $");
+
+#include <sys/param.h>
+#include <errno.h>
+#include <paths.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <assert.h>
+#include <libgeom.h>
+#include <geom/raid5/g_raid5.h>
+
+#include "core/geom.h"
+#include "misc/subr.h"
+
+/*
+ * Non-static version globals (presumably picked up by the geom(8) class
+ * loader -- confirm against core/geom.h).
+ */
+uint32_t lib_version = G_LIB_VERSION;
+uint32_t version = G_RAID5_VERSION;
+
+/* Default value wired to the "stripesize" option of "label": 64 KiB. */
+static intmax_t default_stripesize = 64*1024;
+
+/* Userland handlers; raid5_main() dispatches to the other three. */
+static void raid5_main(struct gctl_req *req, unsigned flags);
+static void raid5_clear(struct gctl_req *req);
+static void raid5_dump(struct gctl_req *req);
+static void raid5_label(struct gctl_req *req);
+
+/* Fall back to G_TYPE_NONE when G_TYPE_BOOL is not provided. */
+#if !defined(G_TYPE_BOOL)
+#define G_TYPE_BOOL G_TYPE_NONE
+#endif
+
+/*
+ * GCMD67 expands to an extra "NULL," initializer in class_commands[] when
+ * __FreeBSD_version >= 700000 and to nothing otherwise.
+ */
+#if __FreeBSD_version >= 700000
+#define GCMD67 NULL,
+#else
+#define GCMD67
+#endif
+/*
+ * Command table for the raid5 class.  Entries whose handler is raid5_main
+ * ("clear", "dump", "label") are handled in this file; entries with a NULL
+ * handler carry only their option list and usage string here.
+ * The last string of every entry is the usage text; keep it in sync with the
+ * option list of that entry.
+ */
+struct g_command class_commands[] = {
+	{ "clear", G_FLAG_VERBOSE, raid5_main, G_NULL_OPTS, GCMD67
+	    "[-v] prov ..."
+	},
+	{ "destroy", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		{ 'y', "noyoyo", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    }, GCMD67
+	    "[-fvy] name ..."
+	},
+	{ "remove", G_FLAG_VERBOSE, NULL, G_NULL_OPTS, GCMD67
+	    "[-v] name prov"
+	},
+	{ "insert", G_FLAG_VERBOSE, NULL,
+	    { { 'h', "hardcode", NULL, G_TYPE_BOOL },
+	    G_OPT_SENTINEL}, GCMD67
+	    "[-hv] name prov"
+	},
+	{ "configure", G_FLAG_VERBOSE, NULL,
+	    { { 'h', "hardcode", NULL, G_TYPE_BOOL },
+	    { 'a', "activate", NULL, G_TYPE_BOOL },
+	    { 'c', "cowop", NULL, G_TYPE_BOOL },
+	    { 'n', "nohot", NULL, G_TYPE_BOOL },
+	    { 'S', "safeop", NULL, G_TYPE_BOOL },
+	    { 'R', "rebuild", NULL, G_TYPE_BOOL },
+	    G_OPT_SENTINEL}, GCMD67
+	    "[-RSchnva] name"
+	},
+	{ "dump", 0, raid5_main, G_NULL_OPTS, GCMD67
+	    "prov ..."
+	},
+	{ "label", G_FLAG_VERBOSE | G_FLAG_LOADKLD, raid5_main,
+	    { { 'c', "cowop", NULL, G_TYPE_BOOL },
+	    { 'h', "hardcode", NULL, G_TYPE_BOOL },
+	    { 'n', "nohot", NULL, G_TYPE_BOOL },
+	    { 's', "stripesize", &default_stripesize, G_TYPE_NUMBER },
+	    { 'S', "safeop", NULL, G_TYPE_BOOL },
+	    G_OPT_SENTINEL}, GCMD67
+	    "[-chvn] [-s stripesize] [-S] name prov ..."
+	},
+	{ "stop", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		{ 'y', "noyoyo", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    }, GCMD67
+	    /* "stop" accepts -y just like "destroy"; advertise it. */
+	    "[-fvy] name ..."
+	},
+	G_CMD_SENTINEL
+};
+
+/* Set to 1 by raid5_main() when G_FLAG_VERBOSE is passed in. */
+static int verbose = 0;
+
+/*
+ * Entry point for the verbs handled in this file.  Reads the "verb" request
+ * argument and forwards the request to raid5_label(), raid5_clear() or
+ * raid5_dump(); a missing or unknown verb is reported via gctl_error().
+ */
+static void
+raid5_main(struct gctl_req *req, unsigned flags)
+{
+	const char *verb;
+
+	if (flags & G_FLAG_VERBOSE)
+		verbose = 1;
+
+	verb = gctl_get_ascii(req, "verb");
+	if (verb == NULL) {
+		gctl_error(req, "No '%s' argument.", "verb");
+		return;
+	}
+
+	if (!strcmp(verb, "label")) {
+		raid5_label(req);
+		return;
+	}
+	if (!strcmp(verb, "clear")) {
+		raid5_clear(req);
+		return;
+	}
+	if (!strcmp(verb, "dump")) {
+		raid5_dump(req);
+		return;
+	}
+	gctl_error(req, "Unknown command: %s.", verb);
+}
+
+/*
+ * Handle the "label" verb: validate the request, clear the metadata of
+ * every listed provider, then store fresh RAID5 metadata on each of them.
+ *
+ * Request arguments read: "nargs", "nohot", "hardcode", "safeop", "cowop",
+ * "stripesize", "arg0" (device name) and "arg1".."argN" (providers).
+ *
+ * Validation and clear failures abort the whole operation via gctl_error().
+ * A failure on a single provider in the store phase is printed to stderr,
+ * flagged with gctl_error("Not fully done.") and the loop moves on to the
+ * next provider.
+ */
+static void
+raid5_label(struct gctl_req *req)
+{
+	struct g_raid5_metadata md;
+	const char *name;
+	int error, i, hardcode, nargs, safeop, nohot, cowop;
+	intmax_t stripesize;
+	int64_t min, waste;
+
+	nargs = gctl_get_int(req, "nargs");
+	if (nargs < 3) {
+		gctl_error(req, "Too few arguments.");
+		return;
+	}
+	nohot = gctl_get_int(req, "nohot");
+	hardcode = gctl_get_int(req, "hardcode");
+	safeop = gctl_get_int(req, "safeop");
+	cowop = gctl_get_int(req, "cowop");
+	stripesize = gctl_get_intmax(req, "stripesize");
+	/*
+	 * powerof2(0) is true, so zero (and negative values) must be rejected
+	 * explicitly; otherwise "pmin % stripesize" below divides by zero.
+	 */
+	if (stripesize <= 0) {
+		gctl_error(req, "Invalid stripe size: %jd.", stripesize);
+		return;
+	}
+	if (stripesize > 256*1024) {
+		gctl_error(req, "stripesize must be less than 512KB.");
+		return;
+	}
+	if (!powerof2(stripesize)) {
+		int cs;
+
+		/* Suggest the next power of two, starting at 4096. */
+		for (cs = 4096; cs < stripesize; cs <<= 1)
+			;
+		gctl_error(req, "Invalid stripe size: %jd, recommended: %d.",
+		    stripesize, cs);
+		return;
+	}
+
+	/*
+	 * Clear last sector first to spoil all components if device exists.
+	 */
+	for (i = 1; i < nargs; i++) {
+		name = gctl_get_ascii(req, "arg%d", i);
+		error = g_metadata_clear(name, NULL);
+		if (error != 0) {
+			gctl_error(req, "Can't clear metadata on %s: %s.", name,
+			    strerror(error));
+			return;
+		}
+	}
+
+	/* Start from an all-zero record so no stack garbage is encoded. */
+	bzero(&md, sizeof(md));
+	strlcpy(md.md_magic, G_RAID5_MAGIC, sizeof(md.md_magic));
+	md.md_version = G_RAID5_VERSION;
+	name = gctl_get_ascii(req, "arg0");
+	strlcpy(md.md_name, name, sizeof(md.md_name));
+	md.md_id = arc4random();
+	md.md_all = nargs - 1;
+	md.md_stripesize = stripesize;
+	md.md_verified = 0;
+	md.md_newest = -1;
+	md.md_no_hot = nohot;
+	md.md_state = nohot ? G_RAID5_STATE_CALM :
+	    (G_RAID5_STATE_HOT|G_RAID5_STATE_VERIFY);
+	if (safeop)
+		md.md_state |= G_RAID5_STATE_SAFEOP;
+	if (cowop)
+		md.md_state |= G_RAID5_STATE_COWOP;
+
+	/*
+	 * Ok, store metadata.
+	 * "min" tracks the smallest usable provider size seen so far and
+	 * "waste" accumulates the bytes that cannot be used because of
+	 * stripe rounding and size differences between providers.
+	 */
+	min = -1;
+	waste = 0;
+	for (i = 1; i < nargs; i++) {
+		u_char sector[512];
+		int64_t pmin;
+
+		name = gctl_get_ascii(req, "arg%d", i);
+		md.md_no = i - 1;
+		if (!hardcode)
+			bzero(md.md_provider, sizeof(md.md_provider));
+		else {
+			if (strncmp(name, _PATH_DEV, strlen(_PATH_DEV)) == 0)
+				name += strlen(_PATH_DEV);
+			strlcpy(md.md_provider, name, sizeof(md.md_provider));
+		}
+		md.md_provsize = g_get_mediasize(name);
+		/*
+		 * Check the size before using it: a failed lookup must not
+		 * skew min/waste, and errno has to be read before any other
+		 * library call can overwrite it.
+		 */
+		if (md.md_provsize == 0) {
+			fprintf(stderr, "Can't get mediasize of %s: %s.\n",
+			    name, strerror(errno));
+			gctl_error(req, "Not fully done.");
+			continue;
+		}
+		pmin = md.md_provsize - g_get_sectorsize(name);
+		waste += pmin % stripesize;
+		if (min < 0)
+			min = pmin;
+		else if (min > pmin) {
+			/* All earlier providers lose the difference. */
+			waste += (i-1) * (min - pmin);
+			min = pmin;
+		} else
+			waste += pmin - min;
+		bzero(sector, sizeof(sector));
+		raid5_metadata_encode(&md, sector);
+		error = g_metadata_store(name, sector, sizeof(sector));
+		if (error != 0) {
+			fprintf(stderr, "Can't store metadata on %s: %s.\n",
+			    name, strerror(error));
+			gctl_error(req, "Not fully done.");
+			continue;
+		}
+		if (verbose)
+			printf("Metadata value stored on %s.\n", name);
+	}
+	if (waste > 0)
+		printf("Wasting %jd bytes (>=%jdGB).\n", (intmax_t)waste,
+		    (intmax_t)(waste>>(3*10)));
+}
+
+/*
+ * Handle the "clear" verb: call g_metadata_clear() with G_RAID5_MAGIC on
+ * every provider named in "arg0".."argN".  A failure on one provider is
+ * printed to stderr and flagged with gctl_error(); the remaining providers
+ * are still processed.
+ */
+static void
+raid5_clear(struct gctl_req *req)
+{
+	const char *prov;
+	int count, idx, rc;
+
+	count = gctl_get_int(req, "nargs");
+	if (count < 1) {
+		gctl_error(req, "Too few arguments.");
+		return;
+	}
+
+	for (idx = 0; idx < count; idx++) {
+		prov = gctl_get_ascii(req, "arg%d", idx);
+		rc = g_metadata_clear(prov, G_RAID5_MAGIC);
+		if (rc == 0) {
+			if (verbose)
+				printf("Metadata cleared on %s.\n", prov);
+		} else {
+			fprintf(stderr, "Can't clear metadata on %s: %s.\n",
+			    prov, strerror(rc));
+			gctl_error(req, "Not fully done.");
+		}
+	}
+}
+
+/*
+ * Print every field of a decoded metadata record to stdout, one per line.
+ * The "%jd" conversions take an intmax_t, so the 64-bit fields are cast
+ * explicitly instead of relying on their typedef matching intmax_t.
+ */
+static void
+raid5_metadata_dump(const struct g_raid5_metadata *md)
+{
+
+	printf(" Magic string: %s\n", md->md_magic);
+	printf(" Metadata version: %u\n", (u_int)md->md_version);
+	printf(" Device name: %s\n", md->md_name);
+	printf(" Device ID: %u\n", (u_int)md->md_id);
+	printf(" Disk number: %u\n", (u_int)md->md_no);
+	printf("Total number of disks: %u\n", (u_int)md->md_all);
+	printf(" Provider Size: %jd\n", (intmax_t)md->md_provsize);
+	printf(" Verified: %jd\n", (intmax_t)md->md_verified);
+	printf(" State: %u\n", (u_int)md->md_state);
+	printf(" Stripe size: %u\n", (u_int)md->md_stripesize);
+	/* NOTE(review): raid5_label() stores -1 here; printed as unsigned. */
+	printf(" Newest: %u\n", (u_int)md->md_newest);
+	printf(" NoHot: %s\n", md->md_no_hot?"Yes":"No");
+	printf(" Hardcoded provider: %s\n", md->md_provider);
+}
+
+/*
+ * Handle the "dump" verb: for every provider in "arg0".."argN" read the raw
+ * metadata with g_metadata_read(), decode it and print it with
+ * raid5_metadata_dump().  A read failure on one provider is printed to
+ * stderr and flagged with gctl_error(); the other providers are still
+ * dumped.
+ */
+static void
+raid5_dump(struct gctl_req *req)
+{
+	struct g_raid5_metadata raw, decoded;
+	const char *prov;
+	int count, idx, rc;
+
+	count = gctl_get_int(req, "nargs");
+	if (count < 1) {
+		gctl_error(req, "Too few arguments.");
+		return;
+	}
+
+	for (idx = 0; idx < count; idx++) {
+		prov = gctl_get_ascii(req, "arg%d", idx);
+		rc = g_metadata_read(prov, (u_char *)&raw, sizeof(raw),
+		    G_RAID5_MAGIC);
+		if (rc == 0) {
+			raid5_metadata_decode((u_char *)&raw, &decoded);
+			printf("Metadata on %s:\n", prov);
+			raid5_metadata_dump(&decoded);
+			printf("\n");
+		} else {
+			fprintf(stderr, "Can't read metadata from %s: %s.\n",
+			    prov, strerror(rc));
+			gctl_error(req, "Not fully done.");
+		}
+	}
+}
Added: projects/geom_raid5/sbin/geom/class/raid5/graid5.8
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ projects/geom_raid5/sbin/geom/class/raid5/graid5.8 Mon Feb 2 20:51:26 2009 (r188025)
@@ -0,0 +1,309 @@
+.\" Copyright (c) 2006 Arne Woerner <arne_woerner at yahoo.com>
+.\" testing + tuning-tricks: veronica at fluffles.net
+.\" testing: lev at FreeBSD.org
+.\" derived from gstripe/gmirror (Pawel Jakub Dawidek <pjd at FreeBSD.org>)
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $Id: graid5.8,v 1.18 2008/05/22 02:10:47 aw Exp $
+.\"
+.Dd December 11, 2006
+.Dt GRAID5 8
+.Os
+.Sh NAME
+.Nm graid5
+.Nd "control utility for raid5 devices"
+.Sh SYNOPSIS
+.Nm
+.Cm destroy
+.Op Fl fvy
+.Ar name ...
+.Nm
+.Cm label
+.Op Fl chnSv
+.Op Fl s Ar stripesize
+.Ar name
+.Ar prov prov ...
+.Nm
+.Cm configure
+.Op Fl achnRSv
+.Ar name
+.Nm
+.Cm stop
+.Op Fl fvy
+.Ar name ...
+.Nm
+.Cm insert
+.Ar name prov
+.Nm
+.Cm remove
+.Ar name prov
+.Nm
+.Cm clear
+.Op Fl v
+.Ar prov ...
+.Nm
+.Cm dump
+.Ar prov ...
+.Nm
+.Cm list
+.Nm
+.Cm status
+.Nm
+.Cm load
+.Nm
+.Cm unload
+.Sh DESCRIPTION
+The
+.Nm
+utility is used for setting up a RAID-5 on two or more disks.
+The RAID5'ed device can be configured using two different methods:
+.Dq manual
+or
+.Dq automatic .
+When using the
+.Dq manual
+method, no metadata are stored on the devices, so the RAID5
+device has to be configured by hand every time it is needed.
+The
+.Dq automatic
+method uses on-disk metadata to detect devices.
+Once devices are labeled, they will be automatically detected and
+configured.
+.Pp
+The first argument to
+.Nm
+indicates an action to be performed:
+.Bl -tag -width ".Cm destroy"
+.It Cm label
+Set up a RAID5 device from the given devices with the specified
+.Ar name .
+This is the
+.Dq automatic
+method, where metadata are stored in every device's last sector.
+The kernel module
+.Pa geom_raid5.ko
+will be loaded if it is not loaded already.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl s Ar stripesize"
+.It Fl h
+Hardcode providers' names in metadata.
+.It Fl c
+CowOp mode: Complete-Only-Write-Operation --
+do not write if not in status COMPLETE.
+.It Fl S
+SafeOp mode: read the whole stripe for every read and verify parity.
+.It Fl n
+never-hot-mode: A 2 disk graid5 device does not need the hot marker
+if it is used as swap space. Furthermore, this flag is useful if
+a rebuild would be harmful even if a write request was pending.
+.It Fl s Ar stripesize
+Specify stripesize.
+Recommendation: MAXPHYS (currently 128KiB) == stripesize.
+The
+.Ar stripesize
+must be a power of 2 and
+a multiple of the largest sector size of all the providers.
+.El
+.It Cm configure
+Configure an existing graid5 device:
+.Pp
+Options are:
+.Bl -tag -width "Fl h"
+.It Fl h
+Trigger: hardcoded option.
+.It Fl a
+Reset error flag of all disks.
+.It Fl c
+CowOp mode: Complete-Only-Write-Operation --
+do not write if not in status COMPLETE.
+.It Fl n
+Trigger: never-hot-mode option.
+.It Fl S
+Trigger: SafeOp-mode option.
+.It Fl R
+Trigger: start/stop re-sync.
+.El
+.It Cm stop
+Turn off an existing RAID5 device by its
+.Ar name .
+This command does not touch on-disk metadata!
+.Pp
+Options are:
+.Bl -tag -width "Fl y"
+.It Fl f
+Force destroy even if still busy.
+.It Fl y
+Do not do the Yo-Yo effect.
+.El
+.It Cm destroy
+Same as
+.Cm stop .
+.It Cm clear
+Clear metadata on the given devices.
+.It Cm dump
+Dump metadata stored on the given devices.
+.It Cm list
+See
+.Xr geom 8 .
+.It Cm status
+See
+.Xr geom 8 .
+.It Cm load
+See
+.Xr geom 8 .
+.It Cm unload
+See
+.Xr geom 8 .
+.El
+.Pp
+Additional options:
+.Bl -tag -width ".Fl f"
+.It Fl f
+Force the removal of the specified RAID5 device.
+.It Fl v
+Be more verbose.
+.El
+.Sh SYSCTL VARIABLES
+The following
+.Xr sysctl 8
+variables can be used to control the behavior of the
+.Nm RAID5
+GEOM class.
+The default value is shown next to each variable.
+.Bl -tag -width indent
+.It Va kern.geom.raid5.debug : No 0
+Debug level of the
+.Nm RAID5
+GEOM class.
+This can be set to a number between 0 and 3 inclusive.
+If set to 0 minimal debug information is printed, and if set to 3 the
+maximum amount of debug information is printed.
+.It Va kern.geom.raid5.mhm : No 0 (read-only)
+Number of malloc hamster cache misses.
+.It Va kern.geom.raid5.mhh : No 0 (read-only)
+Number of malloc hamster cache hits.
+.It Va kern.geom.raid5.maxmem : No 8000000 (tunable)
+This variable can be set any time to any 32bit signed integer value.
+It is cropped appropriately (0..128MB) and interpreted as bytes.
+.It Va kern.geom.raid5.wqf : No 0 (read-only)
+This value shows the number of write requests that were issued early due to
+a conflicting read request.
+.It Va kern.geom.raid5.wqp : No 0 (read-only)
+This value shows the maximum number of pending write requests so far.
+.It Va kern.geom.raid5.blked1 : No 0 (read-only)
+This value shows the number of new write requests that could not be combined
+because the corresponding area already has an issued but incomplete
+write request.
+.It Va kern.geom.raid5.blked2 : No 0 (read-only)
+This value shows the number of due write (2-phase) requests that were blocked by
+another such request due to parity area conflict.
+.It Va kern.geom.raid5.dsk_ok : No 50 (read-only)
+This value shows the healthiness of the underlying devices.
+50 is perfect. 40 or lower triggers a soft-device-remove.
+0 causes an error announced to the upper layer.
+.It Va kern.geom.raid5.veri_nice : No 100 (tunable)
+This value (milli seconds) enforces a delay after a user-land read request
+for internal verify requests, which are certainly quite hindering for
+user-land requests, because they read all disks and in some cases even
+write a disk.
+.It Va kern.geom.raid5.veri_w : No 0 (read-only)
+This value shows the number of parity failures (during rebuild).
+.It Va kern.geom.raid5.veri : No 0 (read-only)
+This value shows the number of parity checks (during rebuild).
+.It Va kern.geom.raid5.wreq2_cnt : No 0 (read-only)
+Number of 2-phase writes (1. phase: read data&parity (or "other" data in case
+of three disks); 2. phase: write data&parity).
+.It Va kern.geom.raid5.wreq1_cnt : No 0 (read-only)
+Number of 1-phase writes (sufficiently long chunks can be written in one
+phase).
+.It Va kern.geom.raid5.wreq_cnt : No 0 (read-only)
+Write requests started by upper layer.
+.It Va kern.geom.raid5.rreq_cnt : No 0 (read-only)
+Read requests started by upper layer.
+.It Va kern.geom.raid5.maxwql : No 25 (tunable)
+This variable gives a hint for the maximum length of the write queue.
+Write requests are queued until they are long enough or old enough or
+until there are too many of them.
+.It Va kern.geom.raid5.wdt : No 5 (tunable)
+This variable determines the maximum age of a write request before it
+is issued.
+.It Va kern.geom.raid5.tooc : No 5 (tunable)
+This variable determines the time-out-on-create. The provider is not
+created before all consumers are present or the timeout is over.
+.El
+.Sh EXIT STATUS
+Exit status is 0 on success, and 1 if the command fails.
+.Sh EXAMPLES
+The following example shows how to set up a RAID5 device from four disks with a
+128KB stripe size for automatic configuration,
+create a file system on it,
+and mount it:
+.Bd -literal -offset indent
+graid5 label -v -s 131072 data /dev/da0 /dev/da1 /dev/da2 /dev/da3
+newfs /dev/raid5/data
+mount /dev/raid5/data /mnt
+[...]
+umount /mnt
+graid5 stop data
+graid5 unload
+.Ed
+.Sh COMPATIBILITY
+The
+.Nm
+interleave is in number of bytes,
+unlike
+.Xr ccdconfig 8
+and
+.Xr atacontrol 8
+which use the number of sectors.
+A
+.Xr ccdconfig 8
+.Ar ileave
+of
+.Ql 128
+is 64 KB (128 512B sectors).
+The same stripe interleave would be specified as
+.Ql 65536
+for
+.Nm .
+.Sh SEE ALSO
+.Xr geom 4 ,
+.Xr loader.conf 5 ,
+.Xr atacontrol 8 ,
+.Xr ccdconfig 8 ,
+.Xr geom 8 ,
+.Xr mount 8 ,
+.Xr newfs 8 ,
+.Xr sysctl 8 ,
+.Xr umount 8 ,
+.Xr vinum 8
+.Sh HISTORY
+The
+.Nm
+utility appeared in
+.Fx 5.3 .
+.Sh AUTHORS
+.An Arne Woerner Aq arne_woerner at yahoo.com
+.An testing & tuning: Aq veronica at fluffles.net
Added: projects/geom_raid5/sys/geom/raid5/g_raid5.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ projects/geom_raid5/sys/geom/raid5/g_raid5.c Mon Feb 2 20:51:26 2009 (r188025)
@@ -0,0 +1,4174 @@
+/*
+ * Copyright (c) 2006 Arne Woerner <arne_woerner at yahoo.com>
+ * testing + tuning-tricks: veronica at fluffles.net
+ * derived from gstripe/gmirror (Pawel Jakub Dawidek <pjd at FreeBSD.org>)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$Id: g_raid5.c,v 1.271.1.274 2008/07/29 13:58:03 aw Exp aw $");
+
+/*
+ * MYKASSERT(cond, (fmt, ...)): assertion wrapper.  Maps to KASSERT() when
+ * that macro is already defined at this point of the file; otherwise it logs
+ * the source line through G_RAID5_DEBUG() and calls panic() with the
+ * parenthesized message.
+ */
+#ifdef KASSERT
+#define MYKASSERT(a,b) KASSERT(a,b)
+#else
+#define MYKASSERT(a,b) do {if (!(a)) { G_RAID5_DEBUG(0,"KASSERT in line %d.",__LINE__); panic b;}} while (0)
+#endif
+/*
+ * ORDER(a, b): swap two int lvalues if needed so that a <= b afterwards.
+ * The arguments are parenthesized so that expressions such as *p or x[i]
+ * expand correctly; each argument is still evaluated more than once, so it
+ * must be free of side effects.
+ */
+#define ORDER(a,b) do {if ((a) > (b)) { int tmp = (a); (a) = (b); (b) = tmp; }} while(0)
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/bio.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/kthread.h>
+#include <sys/eventhandler.h>
+#include <sys/sched.h>
+#include <geom/geom.h>
+#include <geom/raid5/g_raid5.h>
+
+/*
+ * our sysctl-s
+ */
+/*
+ * kern.geom.raid5.* sysctl tree.  CTLFLAG_RW entries with a TUNABLE_INT()
+ * can also be preset as loader tunables; CTLFLAG_RD entries are counters
+ * exported for inspection only.
+ */
+SYSCTL_DECL(_kern_geom);
+SYSCTL_NODE(_kern_geom, OID_AUTO, raid5, CTLFLAG_RW, 0, "GEOM_RAID5 stuff");
+/* csm is a u_int, so it is exported with SYSCTL_UINT like its siblings. */
+static u_int g_raid5_cache_size_mem = 64*1024*1024;
+TUNABLE_INT("kern.geom.raid5.csm", &g_raid5_cache_size_mem);
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, csm, CTLFLAG_RW, &g_raid5_cache_size_mem,
+    0, "cache size ((<disk count>-1)*<stripe size> per bucket) in bytes");
+static int g_raid5_cache_size = -5;
+TUNABLE_INT("kern.geom.raid5.cs", &g_raid5_cache_size);
+SYSCTL_INT(_kern_geom_raid5, OID_AUTO, cs, CTLFLAG_RW, &g_raid5_cache_size,0,
+    "cache size ((<disk count>-1)*<stripe size> per bucket)");
+static u_int g_raid5_debug = 0;
+TUNABLE_INT("kern.geom.raid5.debug", &g_raid5_debug);
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, debug, CTLFLAG_RW, &g_raid5_debug, 0,
+    "Debug level");
+static u_int g_raid5_tooc = 5;
+TUNABLE_INT("kern.geom.raid5.tooc", &g_raid5_tooc);
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, tooc, CTLFLAG_RW, &g_raid5_tooc, 0,
+    "timeout on create (in order to avoid unnecessary rebuilds on reboot)");
+static u_int g_raid5_wdt = 5;
+TUNABLE_INT("kern.geom.raid5.wdt", &g_raid5_wdt);
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wdt, CTLFLAG_RW, &g_raid5_wdt, 0,
+    "write request delay (in seconds)");
+static u_int g_raid5_maxwql = 25;
+TUNABLE_INT("kern.geom.raid5.maxwql", &g_raid5_maxwql);
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, maxwql, CTLFLAG_RW, &g_raid5_maxwql, 0,
+    "max wait queue length");
+static u_int g_raid5_veri_fac = 25;
+TUNABLE_INT("kern.geom.raid5.veri_fac", &g_raid5_veri_fac);
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, veri_fac, CTLFLAG_RW, &g_raid5_veri_fac,
+    0, "veri brake factor in case of veri_min * X < veri_max");
+static u_int g_raid5_veri_nice = 100;
+TUNABLE_INT("kern.geom.raid5.veri_nice", &g_raid5_veri_nice);
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO,veri_nice, CTLFLAG_RW,&g_raid5_veri_nice,
+    0, "wait this many milli seconds after last user-read (less than 1sec)");
+static u_int g_raid5_vsc = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, veri, CTLFLAG_RD, &g_raid5_vsc, 0,
+    "verify stripe count");
+static u_int g_raid5_vwc = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, veri_w, CTLFLAG_RD, &g_raid5_vwc, 0,
+    "verify write count");
+static u_int g_raid5_rrc = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, rreq_cnt, CTLFLAG_RD, &g_raid5_rrc, 0,
+    "read request count");
+static u_int g_raid5_wrc = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wreq_cnt, CTLFLAG_RD, &g_raid5_wrc, 0,
+    "write request count");
+static u_int g_raid5_w1rc = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wreq1_cnt, CTLFLAG_RD, &g_raid5_w1rc, 0,
+    "write request count (1-phase)");
+static u_int g_raid5_w2rc = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wreq2_cnt, CTLFLAG_RD, &g_raid5_w2rc, 0,
+    "write request count (2-phase)");
+static u_int g_raid5_disks_ok = 50;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, dsk_ok, CTLFLAG_RD, &g_raid5_disks_ok,0,
+    "repeat EIO'ed request?");
+static u_int g_raid5_blked1 = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, blked1, CTLFLAG_RD, &g_raid5_blked1,0,
+    "1. kind block count");
+static u_int g_raid5_blked2 = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, blked2, CTLFLAG_RD, &g_raid5_blked2,0,
+    "2. kind block count");
+static u_int g_raid5_wqp = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wqp, CTLFLAG_RD, &g_raid5_wqp,0,
+    "max. write queue length");
+static u_int g_raid5_mhm = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, mhm, CTLFLAG_RD, &g_raid5_mhm,0,
+    "memory hamster miss");
+static u_int g_raid5_mhh = 0;
+SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, mhh, CTLFLAG_RD, &g_raid5_mhh,0,
+    "memory hamster hit");
+
+/* malloc(9) type used for every allocation made in this file. */
+static MALLOC_DEFINE(M_RAID5, "raid5_data", "GEOM_RAID5 Data");
+
+/* Forward declarations for the class methods wired up below. */
+static int g_raid5_destroy(struct g_raid5_softc *sc,
+    boolean_t force, boolean_t noyoyo);
+static int g_raid5_destroy_geom(struct gctl_req *req, struct g_class *mp,
+    struct g_geom *gp);
+
+static g_taste_t	g_raid5_taste;
+static g_ctl_req_t	g_raid5_config;
+static g_dumpconf_t	g_raid5_dumpconf;
+
+static eventhandler_tag g_raid5_post_sync = NULL;
+
+static void g_raid5_init(struct g_class *mp);
+static void g_raid5_fini(struct g_class *mp);
+
+/* GEOM class descriptor for RAID5. */
+struct g_class g_raid5_class = {
+	.name		= G_RAID5_CLASS_NAME,
+	.version	= G_VERSION,
+	.ctlreq		= g_raid5_config,
+	.taste		= g_raid5_taste,
+	.destroy_geom	= g_raid5_destroy_geom,
+	.init		= g_raid5_init,
+	.fini		= g_raid5_fini
+};
+
+/* GCD & LCM */
+/*
+ * Greatest common divisor of a and b (Euclid's algorithm).
+ * Returns a when b is 0, so gcd(0, 0) is 0.
+ */
+static __inline u_int
+gcd(u_int a, u_int b)
+{
+	u_int rem;
+
+	for (; b != 0; a = b, b = rem)
+		rem = a % b;
+	return a;
+}
+/*
+ * Least common multiple of a and b.
+ * Divides before multiplying so the intermediate value cannot overflow
+ * when the result itself fits in a u_int, and returns 0 instead of dividing
+ * by zero when both arguments are 0.
+ */
+static __inline u_int
+g_raid5_lcm(u_int a, u_int b)
+{
+	u_int g = gcd(a, b);
+
+	if (g == 0)
+		return (0);
+	return ((a / g) * b);
+}
+
+/*
+ * memory hamster stuff
+ * The memory hamster keeps the size of each chunk in the int that sits
+ * directly in front of the address handed to callers; the bio-s only ever
+ * see the chunk starting at offset sizeof(int) of the malloc()ed block.
+ */
+
+/* Size recorded in front of the chunk at address m. */
+static __inline int
+g_raid5_mh_sz_by_a(caddr_t m)
+{
+	int *hdr = (int*)m - 1;
+
+	return *hdr;
+}
+
+/* Size recorded for the i-th chunk in sc->mhl. */
+static __inline int
+g_raid5_mh_sz_by_i(struct g_raid5_softc *sc, int i)
+{
+	caddr_t chunk = sc->mhl[i];
+
+	return g_raid5_mh_sz_by_a(chunk);
+}
+
+/* Record size l in front of the chunk at address m. */
+static __inline void
+g_raid5_mh_sz(caddr_t m, int l)
+{
+	int *hdr = (int*)m - 1;
+
+	*hdr = l;
+}
+
+/* Release the chunk at address m, including its hidden size header. */
+static __inline void
+g_raid5_free_by_a(caddr_t m)
+{
+	caddr_t base = m - sizeof(int);
+
+	free(base, M_RAID5);
+}
+
+/* Release the mi-th chunk in sc->mhl. */
+static __inline void
+g_raid5_free_by_i(struct g_raid5_softc *sc, int mi)
+{
+	caddr_t chunk = sc->mhl[mi];
+
+	g_raid5_free_by_a(chunk);
+}
+/*
+ * Release every chunk currently held in sc->mhl and reset the count to 0.
+ * Takes no lock itself.
+ */
+static void
+g_raid5_mh_all_free(struct g_raid5_softc *sc)
+{
+	int idx = 0;
+
+	while (idx < sc->mhc) {
+		g_raid5_free_by_i(sc, idx);
+		idx++;
+	}
+	sc->mhc = 0;
+}
+/*
+ * Hand out a buffer of at least l bytes, preferring a chunk cached in
+ * sc->mhl over a fresh malloc(9).
+ *
+ * A cached chunk qualifies when its recorded size lies in [l, 2*l]; the
+ * smallest qualifying chunk wins and an exact fit ends the search early.
+ * On a cache miss the buffer is allocated with M_NOWAIT.  If that fails and
+ * force is non-zero, the whole cache is released and the allocation is
+ * retried with M_WAITOK.
+ *
+ * Returns the usable address (just past the hidden size header), or NULL
+ * when the M_NOWAIT allocation failed and force is zero.
+ */
+static caddr_t
+g_raid5_malloc(struct g_raid5_softc *sc, int l, int force)
+{
+	mtx_lock(&sc->mh_mtx);
+	int h = l*2;
+	int fi = -1;
+	int fl = -1;
+	int i;
+	for (i=0; i<sc->mhc; i++) {
+		int ml = g_raid5_mh_sz_by_i(sc,i);
+		if (ml < l || ml > h)
+			continue;
+		if (fl > 0 && ml >= fl)
+			continue;
+		fl = ml;
+		fi = i;
+		if (ml == l)
+			break;
+	}
+	caddr_t m;
+	if (fi >= 0) {
+		/* Cache hit: take the chunk and fill the hole with the last entry. */
+		m = sc->mhl[fi];
+		sc->mhc--;
+		if (fi < sc->mhc)
+			sc->mhl[fi] = sc->mhl[sc->mhc];
+		g_raid5_mhh++;
+		mtx_unlock(&sc->mh_mtx);
+	} else {
+		g_raid5_mhm++;
+		mtx_unlock(&sc->mh_mtx);
+		m = malloc(l+sizeof(fl), M_RAID5, M_NOWAIT);
+		if (m == NULL && force) {
+			/*
+			 * sc->mhl and sc->mhc are only touched under mh_mtx
+			 * elsewhere, so the cache must not be emptied while
+			 * the lock is dropped.  The lock is released again
+			 * before the sleeping M_WAITOK allocation.
+			 */
+			mtx_lock(&sc->mh_mtx);
+			g_raid5_mh_all_free(sc);
+			mtx_unlock(&sc->mh_mtx);
+			m = malloc(l+sizeof(fl), M_RAID5, M_WAITOK);
+		}
+		if (m != NULL) {
+			/* Skip the size header and record the chunk size in it. */
+			m += sizeof(fl);
+			g_raid5_mh_sz(m,l);
+		}
+	}
+	return m;
+}
+/*
+ * Give a chunk obtained from g_raid5_malloc() back to the memory hamster.
+ *
+ * While sc->mhl has room (mhc < mhs) the chunk is simply cached.  When the
+ * cache is full, the smallest cached chunk that is strictly smaller than m
+ * is released and m takes its slot; if no cached chunk is smaller, m itself
+ * is released.  Runs entirely under sc->mh_mtx.
+ */
+static void
+g_raid5_free(struct g_raid5_softc *sc, caddr_t m)
+{
+	int len, cur, idx, victim, victim_len;
+
+	mtx_lock(&sc->mh_mtx);
+	MYKASSERT(((int*)m)[-1] > 0, ("this is no mem hamster chunk."));
+
+	if (sc->mhc < sc->mhs) {
+		sc->mhl[sc->mhc] = m;
+		sc->mhc++;
+		mtx_unlock(&sc->mh_mtx);
+		return;
+	}
+
+	len = g_raid5_mh_sz_by_a(m);
+	victim = -1;
+	victim_len = -1;
+	for (idx = 0; idx < sc->mhc; idx++) {
+		cur = g_raid5_mh_sz_by_i(sc, idx);
+		/* Candidate: smaller than m and smaller than the best so far. */
+		if (cur < len && (victim_len <= 0 || cur < victim_len)) {
+			victim = idx;
+			victim_len = cur;
+		}
+	}
+
+	if (victim >= 0) {
+		g_raid5_free_by_i(sc, victim);
+		sc->mhl[victim] = m;
+	} else
+		g_raid5_free_by_a(m);
+
+	mtx_unlock(&sc->mh_mtx);
+}
+/*
+ * Tear down the memory hamster: release all cached chunks, then the
+ * sc->mhl array itself, then destroy sc->mh_mtx.
+ */
+static void
+g_raid5_mh_destroy(struct g_raid5_softc *sc)
+{
+	int idx;
+
+	for (idx = 0; idx < sc->mhc; idx++)
+		g_raid5_free_by_a(sc->mhl[idx]);
+	sc->mhc = 0;
+
+	free(sc->mhl, M_RAID5);
+	mtx_destroy(&sc->mh_mtx);
+}
+
+/*
+ * cache entry manager
+ * implements a simple queue (fst; for next bio it (ab)uses bio's bio_queue)
+ */
+
+/* Non-zero when the queue of cache entry ce is empty (fst is NULL). */
+static __inline int
+g_raid5_ce_em(struct g_raid5_cache_entry *ce)
+{
+	return (NULL == ce->fst);
+}
+
+/* Address of the i-th entry of the sc->ce array. */
+static __inline struct g_raid5_cache_entry *
+g_raid5_ce_by_i(struct g_raid5_softc *sc, int i)
+{
+	return &sc->ce[i];
+}
+/*
+ * Look up, or claim, the cache entry for stripe number s.
+ *
+ * Entries store s+1 in their sno field, so sno == 0 marks a free entry.
+ * Starting at slot (s+1) % sc->cs, all sc->cs slots are probed with
+ * wrap-around.  A slot already holding this stripe is returned at once.
+ * Otherwise the first free slot seen is claimed and returned.  If there is
+ * no free slot, sc->cfc is incremented and NULL is returned.
+ */
+static struct g_raid5_cache_entry *
+g_raid5_ce_by_sno(struct g_raid5_softc *sc, off_t s)
+{
+	struct g_raid5_cache_entry *spare, *cur;
+	int slot, left;
+
+	MYKASSERT(s >= 0, ("s must not be negative."));
+	s++;
+
+	spare = NULL;
+	slot = s % sc->cs;
+	for (left = sc->cs; left > 0; left--) {
+		cur = g_raid5_ce_by_i(sc, slot);
+		if (cur->sno == s)
+			return cur;
+		if (spare == NULL && cur->sno == 0)
+			spare = cur;
+		if (++slot == sc->cs)
+			slot = 0;
+	}
+
+	if (spare == NULL) {
+		sc->cfc++;
+		return NULL;
+	}
+
+	MYKASSERT(spare->fst == NULL, ("ce not free."));
+	MYKASSERT(spare->dc == 0, ("%p dc inconsistency %d.",spare,spare->dc));
+	MYKASSERT(spare->sno == 0, ("ce not free."));
+	spare->sno = s;
+	return spare;
+}
+/* Cache entry for byte offset o: the stripe number is o / sc->fsl. */
+static __inline struct g_raid5_cache_entry *
+g_raid5_ce_by_off(struct g_raid5_softc *sc, off_t o)
+{
+	off_t sno = o/sc->fsl;
+
+	return g_raid5_ce_by_sno(sc, sno);
+}
+
+/* Cache entry for the offset at which bio bp starts. */
+static __inline struct g_raid5_cache_entry *
+g_raid5_ce_by_bio(struct g_raid5_softc *sc, struct bio *bp)
+{
+	off_t start = bp->bio_offset;
+
+	return g_raid5_ce_by_off(sc, start);
+}
+/*
+ * Queue traversal helpers.
+ *
+ * G_RAID5_C_TRAVERSE(sc, bp, ce) / G_RAID5_C_TRAVSAFE(sc, bp, ce):
+ *	visit every bio bp queued on every cache entry ce of softc sc,
+ *	from the last cache slot down to slot 0.
+ * G_RAID5_CE_TRAVERSE(ce, bp) / G_RAID5_CE_TRAVSAFE(ce, bp):
+ *	visit every bio bp queued on the single cache entry ce.
+ *
+ * The TRAVSAFE variants fetch the successor before the loop body runs and
+ * need a variable named <bp>_nxt in scope.
+ * The softc argument is now actually used (and parenthesized) instead of a
+ * variable that had to be called "sc" at the call site.
+ */
+#define G_RAID5_C_TRAVERSE(AAA,BBB,CCC) \
+	for (int i = (AAA)->cs-1; i >= 0; i--) \
+		G_RAID5_CE_TRAVERSE((CCC=g_raid5_ce_by_i((AAA),i)), BBB)
+#define G_RAID5_C_TRAVSAFE(AAA,BBB,CCC) \
+	for (int i = (AAA)->cs-1; i >= 0; i--) \
+		G_RAID5_CE_TRAVSAFE((CCC=g_raid5_ce_by_i((AAA),i)), BBB)
+#define G_RAID5_CE_TRAVERSE(AAA, BBB) \
+	for (BBB = (AAA)->fst; BBB != NULL; BBB = g_raid5_q_nx(BBB))
+#define G_RAID5_CE_TRAVSAFE(AAA, BBB) \
+	for (BBB = (AAA)->fst, BBB##_nxt = g_raid5_q_nx(BBB); \
+	     BBB != NULL; \
+	     BBB = BBB##_nxt, BBB##_nxt = g_raid5_q_nx(BBB))
+/*
+ * Account for one more pending item on cache entry ce: bumps ce->dc and
+ * sc->wqp, and bumps sc->dc when ce->dc leaves zero.
+ */
+static __inline void
+g_raid5_dc_inc(struct g_raid5_softc *sc, struct g_raid5_cache_entry *ce)
+{
+	MYKASSERT(ce->dc >= 0 && sc->dc >= 0 && sc->wqp >= 0, ("cannot happen."));
+	if (ce->dc++ == 0)
+		sc->dc++;
+	sc->wqp++;
+}
+static __inline void
+g_raid5_dc_dec(struct g_raid5_softc *sc, struct g_raid5_cache_entry *ce)
+{
+ MYKASSERT(ce->dc > 0 && sc->dc > 0 && sc->wqp > 0, ("cannot happen."));
+ ce->dc--;
+ if (ce->dc == 0)
+ sc->dc--;
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-projects
mailing list