svn commit: r206497 - in head: sbin/geom/class sbin/geom/class/sched sys/geom/sched sys/modules/geom sys/modules/geom/geom_sched sys/modules/geom/geom_sched/gs_sched sys/modules/geom/geom_sched/gsc...

Roman Divacky rdivacky at FreeBSD.org
Tue Apr 13 19:37:18 UTC 2010


You don't seem to have committed the anticipatory scheduler, gsched_as.

why?

On Mon, Apr 12, 2010 at 04:37:45PM +0000, Luigi Rizzo wrote:
> Author: luigi
> Date: Mon Apr 12 16:37:45 2010
> New Revision: 206497
> URL: http://svn.freebsd.org/changeset/base/206497
> 
> Log:
>   Bring in geom_sched, support for scheduling disk I/O requests
>   in a device-independent manner.  Also include an example anticipatory
>   scheduler, gsched_rr, which gives very nice performance improvements
>   in the presence of competing random access patterns.
>   
>   This is joint work with Fabio Checconi, developed last year
>   and presented at BSDCan 2009. You can find details in the
>   README file or at
>   
>   http://info.iet.unipi.it/~luigi/geom_sched/
> 
> Added:
>   head/sbin/geom/class/sched/
>   head/sbin/geom/class/sched/Makefile   (contents, props changed)
>   head/sbin/geom/class/sched/geom_sched.c   (contents, props changed)
>   head/sbin/geom/class/sched/gsched.8   (contents, props changed)
>   head/sys/geom/sched/
>   head/sys/geom/sched/README   (contents, props changed)
>   head/sys/geom/sched/g_sched.c   (contents, props changed)
>   head/sys/geom/sched/g_sched.h   (contents, props changed)
>   head/sys/geom/sched/gs_rr.c   (contents, props changed)
>   head/sys/geom/sched/gs_scheduler.h   (contents, props changed)
>   head/sys/geom/sched/subr_disk.c   (contents, props changed)
>   head/sys/modules/geom/geom_sched/
>   head/sys/modules/geom/geom_sched/Makefile   (contents, props changed)
>   head/sys/modules/geom/geom_sched/Makefile.inc   (contents, props changed)
>   head/sys/modules/geom/geom_sched/gs_sched/
>   head/sys/modules/geom/geom_sched/gs_sched/Makefile   (contents, props changed)
>   head/sys/modules/geom/geom_sched/gsched_rr/
>   head/sys/modules/geom/geom_sched/gsched_rr/Makefile   (contents, props changed)
> Modified:
>   head/sbin/geom/class/Makefile
>   head/sys/modules/geom/Makefile
> 
> Modified: head/sbin/geom/class/Makefile
> ==============================================================================
> --- head/sbin/geom/class/Makefile	Mon Apr 12 13:46:20 2010	(r206496)
> +++ head/sbin/geom/class/Makefile	Mon Apr 12 16:37:45 2010	(r206497)
> @@ -15,6 +15,7 @@ SUBDIR+=multipath
>  SUBDIR+=nop
>  SUBDIR+=part
>  SUBDIR+=raid3
> +SUBDIR+=sched
>  SUBDIR+=shsec
>  SUBDIR+=stripe
>  SUBDIR+=virstor
> 
> Added: head/sbin/geom/class/sched/Makefile
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ head/sbin/geom/class/sched/Makefile	Mon Apr 12 16:37:45 2010	(r206497)
> @@ -0,0 +1,19 @@
> +# GEOM_LIBRARY_PATH
> +# $FreeBSD$
> +
> +.PATH: /usr/src/sbin/geom/misc
> +
> +CFLAGS += -I/usr/src/sbin/geom
> +
> +CLASS=sched
> +
> +WARNS?= 6
> +CLASS_DIR?=/lib/geom
> +
> +SHLIBDIR?=${CLASS_DIR}
> +SHLIB_NAME?=geom_${CLASS}.so
> +LINKS=  ${BINDIR}/geom ${BINDIR}/g${CLASS}
> +MAN=    g${CLASS}.8
> +SRCS+=  geom_${CLASS}.c subr.c
> +
> +.include <bsd.lib.mk>
> 
> Added: head/sbin/geom/class/sched/geom_sched.c
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ head/sbin/geom/class/sched/geom_sched.c	Mon Apr 12 16:37:45 2010	(r206497)
> @@ -0,0 +1,123 @@
> +/*-
> + * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + */
> +
> +/*
> + * $Id$
> + * $FreeBSD$
> + *
> + * This file implements the userspace library used by the 'geom'
> + * command to load and manipulate disk schedulers.
> + */
> +  
> +#include <sys/cdefs.h>
> +#include <sys/param.h>
> +#include <sys/linker.h>
> +#include <sys/module.h>
> +
> +#include <stdio.h>
> +#include <stdint.h>
> +#include <libgeom.h>
> +
> +#include "core/geom.h"
> +#include "misc/subr.h"
> +
> +#define	G_SCHED_VERSION	0
> +
> +uint32_t lib_version = G_LIB_VERSION;
> +uint32_t version = G_SCHED_VERSION;
> +
> +/*
> + * storage for parameters used by this geom class.
> + * Right now only the scheduler name is used.
> + */
> +static char algo[] = "rr";	/* default scheduler */
> +
> +/*
> + * Adapt to differences in the geom library:
> + * in V1, struct g_command lacks the gc_argname field, and G_TYPE_BOOL
> + * is undefined.
> + */
> +#if G_LIB_VERSION == 1
> +#define G_ARGNAME
> +#define G_TYPE_BOOL	G_TYPE_NUMBER
> +#else
> +#define G_ARGNAME	NULL,
> +#endif
> +
> +static void
> +gcmd_createinsert(struct gctl_req *req, unsigned flags __unused)
> +{
> +	const char *reqalgo;
> +	char name[64];
> +
> +	if (gctl_has_param(req, "algo"))
> +		reqalgo = gctl_get_ascii(req, "algo");
> +	else
> +		reqalgo = algo;
> +
> +	snprintf(name, sizeof(name), "gsched_%s", reqalgo);
> +	/*
> +	 * Do not complain about errors here, gctl_issue()
> +	 * will fail anyway.
> +	 */
> +	if (modfind(name) < 0)
> +		kldload(name);
> +	gctl_issue(req);
> +}
> +
> +struct g_command class_commands[] = {
> +	{ "create", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert,
> +	    {
> +		{ 'a', "algo", algo, G_TYPE_STRING },
> +		G_OPT_SENTINEL
> +	    },
> +	    G_ARGNAME "[-v] [-a algorithm_name] dev ..."
> +	},
> +	{ "insert", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert,
> +	    {
> +		{ 'a', "algo", algo, G_TYPE_STRING },
> +		G_OPT_SENTINEL
> +	    },
> +	    G_ARGNAME "[-v] [-a algorithm_name] dev ..."
> +	},
> +	{ "configure", G_FLAG_VERBOSE, NULL,
> +	    {
> +		{ 'a', "algo", algo, G_TYPE_STRING },
> +		G_OPT_SENTINEL
> +	    },
> +	    G_ARGNAME "[-v] [-a algorithm_name] prov ..."
> +	},
> +	{ "destroy", G_FLAG_VERBOSE, NULL,
> +	    {
> +		{ 'f', "force", NULL, G_TYPE_BOOL },
> +		G_OPT_SENTINEL
> +	    },
> +	    G_ARGNAME "[-fv] prov ..."
> +	},
> +	{ "reset", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
> +	    G_ARGNAME "[-v] prov ..."
> +	},
> +	G_CMD_SENTINEL
> +};
> 
> Added: head/sbin/geom/class/sched/gsched.8
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ head/sbin/geom/class/sched/gsched.8	Mon Apr 12 16:37:45 2010	(r206497)
> @@ -0,0 +1,161 @@
> +.\" Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo
> +.\" All rights reserved.
> +.\" $FreeBSD$
> +.\"
> +.\" Redistribution and use in source and binary forms, with or without
> +.\" modification, are permitted provided that the following conditions
> +.\" are met:
> +.\" 1. Redistributions of source code must retain the above copyright
> +.\"    notice, this list of conditions and the following disclaimer.
> +.\" 2. Redistributions in binary form must reproduce the above copyright
> +.\"    notice, this list of conditions and the following disclaimer in the
> +.\"    documentation and/or other materials provided with the distribution.
> +.\"
> +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
> +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> +.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
> +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> +.\" SUCH DAMAGE.
> +.\"
> +.Dd April 12, 2010
> +.Dt GSCHED 8
> +.Os
> +.Sh NAME
> +.Nm gsched
> +.Nd "control utility for disk scheduler GEOM class"
> +.Sh SYNOPSIS
> +.Nm
> +.Cm create
> +.Op Fl v
> +.Op Fl a Ar algorithm
> +.Ar provider ...
> +.Nm
> +.Cm insert
> +.Op Fl v
> +.Op Fl a Ar algorithm
> +.Ar provider ...
> +.Nm
> +.Cm configure
> +.Op Fl v
> +.Op Fl a Ar algorithm
> +.Ar node ...
> +.Nm
> +.Cm destroy
> +.Op Fl fv
> +.Ar node ...
> +.Nm
> +.Cm reset
> +.Op Fl v
> +.Ar node ...
> +.Nm
> +.Cm { list | status | load | unload }
> +.Sh DESCRIPTION
> +The
> +.Nm
> +utility (also callable as
> +.Nm geom sched ... )
> +changes the scheduling policy of the requests going to a provider.
> +.Pp
> +The first argument to
> +.Nm
> +indicates an action to be performed:
> +.Bl -tag -width ".Cm configure"
> +.It Cm create
> +Create a new provider and geom node using the specified scheduling algorithm.
> +.Ar algorithm
> +is the name of the scheduling algorithm used for the provider.
> +Available algorithms include:
> +.Ar rr ,
> +which implements anticipatory scheduling with round robin service
> +among clients;
> +.Ar as ,
> +which implements a simple form of anticipatory scheduling with
> +no per-client queue.
> +.Pp
> +If the operation succeeds, the new provider should appear with name
> +.Pa /dev/ Ns Ao Ar dev Ac Ns Pa .sched. .
> +The kernel module
> +.Pa geom_sched.ko
> +will be loaded if it is not loaded already.
> +.It Cm insert
> +Operates as "create", but the insertion is "transparent",
> +i.e. the existing provider is rerouted to the newly created geom,
> +which in turn forwards requests to the existing geom.
> +This operation allows one to start/stop a scheduling service
> +on an already existing provider.
> +.Pp
> +A subsequent
> +.Cm destroy
> +will remove the newly created geom and
> +hook the provider back to the original geom.
> +.It Cm configure
> +Configure an existing scheduling provider.
> +It supports the same options as the
> +.Cm create
> +command.
> +.It Cm destroy
> +Destroy the geom specified in the parameter.
> +.It Cm reset
> +Do nothing.
> +.It Cm list | status | load | unload
> +See
> +.Xr geom 8 .
> +.El
> +.Pp
> +Additional options:
> +.Bl -tag -width ".Fl f"
> +.It Fl f
> +Force the removal of the specified provider.
> +.It Fl v
> +Be more verbose.
> +.El
> +.Sh SYSCTL VARIABLES
> +The following
> +.Xr sysctl 8
> +variables can be used to control the behavior of the
> +.Nm SCHED
> +GEOM class.
> +The default value is shown next to each variable.
> +.Bl -tag -width indent
> +.It Va kern.geom.sched.debug : No 0
> +Debug level of the
> +.Nm SCHED
> +GEOM class.
> +This can be set to a number between 0 and 2 inclusive.
> +If set to 0, minimal debug information is printed; if set to 2, the
> +maximum amount of debug information is printed.
> +.El
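
As an aside, the kern.geom.sched.debug level above can also be read and
set programmatically through sysctl(3); the following is only a minimal
userland sketch of mine, equivalent to "sysctl kern.geom.sched.debug=2":

#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	u_int level;
	size_t len = sizeof(level);

	/* Read the current debug level of the SCHED class. */
	if (sysctlbyname("kern.geom.sched.debug", &level, &len, NULL, 0) == -1)
		err(1, "sysctlbyname(read)");
	printf("current debug level: %u\n", level);

	/* Raise it to the maximum documented value. */
	level = 2;
	if (sysctlbyname("kern.geom.sched.debug", NULL, NULL, &level,
	    sizeof(level)) == -1)
		err(1, "sysctlbyname(write)");
	return (0);
}
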
> +.Sh EXIT STATUS
> +Exit status is 0 on success, and 1 if the command fails.
> +.Sh EXAMPLES
> +The following example shows how to create a scheduling provider for disk
> +.Pa /dev/ad0 ,
> +and how to destroy it.
> +.Bd -literal -offset indent
> +# Load the geom_sched module:
> +kldload geom_sched
> +# Load some scheduler classes used by geom_sched:
> +kldload gsched_rr gsched_as
> +# Configure device ad0 to use scheduler 'rr':
> +geom sched insert -a rr ad0
> +# Now provider ad0 uses the 'rr' algorithm;
> +# the new geom is ad0.sched.
> +# Remove the scheduler on the device:
> +geom sched destroy -v ad0.sched.
> +.Ed
> +.Sh SEE ALSO
> +.Xr geom 4 ,
> +.Xr geom 8
> +.Sh HISTORY
> +The
> +.Nm
> +utility appeared in April 2010.
> +.Sh AUTHORS
> +.An Fabio Checconi Aq fabio at FreeBSD.org
> +.An Luigi Rizzo Aq luigi at FreeBSD.org
> 
> Added: head/sys/geom/sched/README
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ head/sys/geom/sched/README	Mon Apr 12 16:37:45 2010	(r206497)
> @@ -0,0 +1,162 @@
> +
> +	--- GEOM BASED DISK SCHEDULERS FOR FREEBSD ---
> +
> +This code contains a framework for GEOM-based disk schedulers and a
> +couple of sample scheduling algorithms that use the framework and
> +implement two forms of "anticipatory scheduling" (see below for more
> +details).
> +
> +As a quick example of what this code can give you, try to run "dd",
> +"tar", or some other program with highly SEQUENTIAL access patterns,
> +together with "cvs", "cvsup", "svn" or other highly RANDOM access patterns
> +(this is not a made-up example: it is pretty common for developers
> +to have one or more apps doing random accesses, and others doing
> +sequential accesses, e.g. loading large binaries from disk, checking
> +the integrity of tarballs, watching media streams and so on).
> +
> +These are the results we get on a local machine (AMD BE2400 dual
> +core CPU, SATA 250GB disk):
> +
> +    /mnt is a partition mounted on /dev/ad0s1f
> +
> +    cvs: 	cvs -d /mnt/home/ncvs-local update -Pd /mnt/ports
> +    dd-read:	dd bs=128k of=/dev/null if=/dev/ad0 (or ad0.sched.)
> +    dd-write:	dd bs=128k if=/dev/zero of=/mnt/largefile
> +
> +			NO SCHEDULER		RR SCHEDULER
> +                	dd	cvs		dd	cvs
> +
> +    dd-read only        72 MB/s	----		72 MB/s	---
> +    dd-write only	55 MB/s	---		55 MB/s	---
> +    dd-read+cvs		 6 MB/s	ok    		30 MB/s	ok
> +    dd-write+cvs	55 MB/s slooow		14 MB/s	ok
> +
> +As you can see, when a cvs is running concurrently with dd, the
> +performance drops dramatically, and depending on read or write mode,
> +one of the two is severely penalized.  The use of the RR scheduler
> +in this example makes the dd-reader go much faster when competing
> +with cvs, and lets cvs progress when competing with a writer.
> +
> +To try it out:
> +
> +1. USERS OF FREEBSD 7, PLEASE READ CAREFULLY THE FOLLOWING:
> +
> +    On loading, this module patches one kernel function (g_io_request())
> +    so that I/O requests ("bio's") carry a classification tag, useful
> +    for scheduling purposes.
> +
> +    ON FREEBSD 7, the tag is stored in an existing (though rarely used)
> +    field of the "struct bio", a solution which makes this module
> +    incompatible with other modules using it, such as ZFS and gjournal.
> +    Additionally, g_io_request() is patched in-memory to add a call
> +    to the function that initializes this field (i386/amd64 only;
> +    for other architectures you need to manually patch sys/geom/geom_io.c).
> +    See details in the file g_sched.c.
> +
> +    On FreeBSD 8.0 and above, the above trick is not necessary,
> +    as the struct bio contains dedicated fields for the classifier,
> +    and hooks for request classifiers.
> +
> +    If you don't like the above, don't run this code.
> +
> +2. PLEASE MAKE SURE THAT THE DISK THAT YOU WILL BE USING FOR TESTS
> +   DOES NOT CONTAIN PRECIOUS DATA.
> +    This is experimental code, so we make no guarantees, though
> +    I am routinely using it on my desktop and laptop.
> +
> +3. EXTRACT AND BUILD THE PROGRAMS
> +    A 'make install' in the directory should work (with root privs),
> +    or you can even try the binary modules.
> +    If you want to build the modules yourself, look at the Makefile.
> +
> +4. LOAD THE MODULE, CREATE A GEOM NODE, RUN TESTS
> +
> +    The scheduler's module must be loaded first:
> +
> +      # kldload gsched_rr
> +
> +    substitute with gsched_as to test AS.  Then, supposing that you are
> +    using /dev/ad0 for testing, a scheduler can be attached to it with:
> +
> +      # geom sched insert ad0
> +
> +    The scheduler is inserted transparently in the geom chain, so
> +    mounted partitions and filesystems will keep working, but
> +    now requests will go through the scheduler.
> +
> +    To change scheduler on-the-fly, you can reconfigure the geom:
> +
> +      # geom sched configure -a as ad0.sched.
> +
> +    assuming that gsched_as was loaded previously.
> +
> +5. SCHEDULER REMOVAL
> +
> +    In principle it is possible to remove the scheduler module
> +    even on an active chain by doing
> +
> +	# geom sched destroy ad0.sched.
> +
> +    However, there is a race in the geom subsystem which makes
> +    the removal unsafe while there are active requests on a chain.
> +    So, to reduce the risk of data loss, make sure
> +    you don't remove a scheduler from a chain with ongoing transactions.
> +
> +--- NOTES ON THE SCHEDULERS ---
> +
> +The important contribution of this code is the framework to experiment
> +with different scheduling algorithms.  'Anticipatory scheduling'
> +is a very powerful technique based on the following reasoning:
> +
> +    The disk throughput is much better if it serves sequential requests.
> +    If we have a mix of sequential and random requests, and we see a
> +    non-sequential request, do not serve it immediately but instead wait
> +    a little bit (2..5ms) to see if there is another one coming that
> +    the disk can serve more efficiently.
> +
> +There are many details that should be added to make sure that the
> +mechanism is effective with different workloads and systems, to
> +gain a few extra percent in performance, and to improve fairness and
> +isolation among processes.  A discussion of the vast literature
> +on the subject is beyond the scope of this short note.
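
As a rough illustration of the idea (the names and structures below are
made up for this note and do not match the actual gs_rr.c code), the core
anticipation decision looks something like this:

#include <stddef.h>

/* State kept for the flow (client) currently "owning" the disk. */
struct flow {
	long	next_offset;	/* offset right after its last request   */
	int	budget_ticks;	/* how long we are still willing to idle  */
};

/* A queued request and the flow that issued it. */
struct request {
	long		offset;
	struct flow	*owner;
};

/*
 * Return nonzero if the disk should idle a little longer instead of
 * serving 'req' now, hoping the active flow issues a sequential request.
 */
static int
should_anticipate(const struct flow *active, const struct request *req)
{
	if (active == NULL)
		return (0);		/* nobody to anticipate for        */
	if (req != NULL && req->owner == active &&
	    req->offset == active->next_offset)
		return (0);		/* sequential: serve it right away */
	/* Non-sequential or foreign request: idle for a few ms (2..5ms). */
	return (active->budget_ticks > 0);
}

The real gs_rr.c additionally keeps per-client queues and serves them in
round-robin order, which is what gives the numbers in the table above.
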
> +
> +--------------------------------------------------------------------------
> +
> +TRANSPARENT INSERT/DELETE
> +
> +geom_sched is an ordinary geom module; however, it is convenient
> +to plug it transparently into the geom graph, so that one can
> +enable or disable scheduling on a mounted filesystem, and the
> +names in /etc/fstab do not depend on the presence of the scheduler.
> +
> +To understand how this works in practice, remember that in GEOM
> +we have "providers" and "geom" objects.
> +Say that we want to hook a scheduler on provider "ad0",
> +accessible through pointer 'pp'. Originally, pp is attached to
> +geom "ad0" (same name, different object), accessible through pointer old_gp:
> +
> +  BEFORE	---> [ pp    --> old_gp ...]
> +
> +A normal "geom sched create ad0" call would create a new geom node
> +on top of provider ad0/pp, and export a newly created provider
> +("ad0.sched." accessible through pointer newpp).
> +
> +  AFTER create  ---> [ newpp --> gp --> cp ] ---> [ pp    --> old_gp ... ]
> +
> +On top of newpp, a whole tree will be created automatically, and we
> +can, e.g., mount partitions on /dev/ad0.sched.s1d, and those requests
> +will go through the scheduler, whereas any partition mounted on
> +the pre-existing device entries will not go through the scheduler.
> +
> +With the transparent insert mechanism, the original provider "ad0"/pp
> +is hooked to the newly created geom, as follows:
> +
> +  AFTER insert  ---> [ pp    --> gp --> cp ] ---> [ newpp --> old_gp ... ]
> +
> +so anything that was previously using provider pp will now have
> +the requests routed through the scheduler node.
> +
> +A removal ("geom sched destroy ad0.sched.") will restore the original
> +configuration.
> +
> +# $FreeBSD$
> 
> Added: head/sys/geom/sched/g_sched.c
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ head/sys/geom/sched/g_sched.c	Mon Apr 12 16:37:45 2010	(r206497)
> @@ -0,0 +1,1901 @@
> +/*-
> + * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + */
> +
> +/*
> + * $Id$
> + * $FreeBSD$
> + *
> + * Main control module for geom-based disk schedulers ('sched').
> + *
> + * USER VIEW
> + * A 'sched' node is typically inserted transparently between
> + * an existing provider pp and its original geom gp
> + *
> + *	[pp --> gp  ..]
> + *
> + * using the command "geom sched insert <provider>" and
> + * resulting in the following topology
> + *
> + *	[pp --> sched_gp --> cp]   [new_pp --> gp ... ]
> + *
> + * Deletion "geom sched destroy <provider>.sched." restores the
> + * original chain. The normal "geom sched create <provider>"
> + * is also supported.
> + *
> + * INTERNALS
> + * Internally, the 'sched' uses the following data structures
> + *
> + *   geom{}         g_sched_softc{}      g_gsched{}
> + * +----------+    +---------------+   +-------------+
> + * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
> + * |  ...     |    |               |   |  gs_fini    |
> + * |          |    | [ hash table] |   |  gs_start   |
> + * +----------+    |               |   |  ...        |
> + *                 |               |   +-------------+
> + *                 |               |
> + *                 |               |     g_*_softc{}
> + *                 |               |   +-------------+
> + *                 | sc_data     *-|-->|             |
> + *                 +---------------+   |  algorithm- |
> + *                                     |  specific   |
> + *                                     +-------------+
> + *
> + * A g_sched_softc{} is created with a "geom sched insert" call.
> + * In turn this instantiates a specific scheduling algorithm,
> + * which sets sc_gsched to point to the algorithm callbacks,
> + * and calls gs_init() to create the g_*_softc{} .
> + * The other callbacks (gs_start, gs_next, ...) are invoked
> + * as needed.
> + *
> + * g_sched_softc{} is defined in g_sched.h and mostly used here;
> + * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
> + * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
> + *
> + * DATA MOVING
> + * When a bio is received on the provider, it goes to the
> + * g_sched_start() which calls gs_start() to initially queue it;
> + * then we call g_sched_dispatch() that loops around gs_next()
> + * to select zero or more bio's to be sent downstream.
> + *
> + * g_sched_dispatch() can also be called as a result of a timeout,
> + * e.g. when doing anticipation or pacing requests.
> + *
> + * When a bio comes back, it goes to g_sched_done() which in turn
> + * calls gs_done(). The latter does any necessary housekeeping in
> + * the scheduling algorithm, and may decide to call g_sched_dispatch()
> + * to send more bio's downstream.
> + *
> + * If an algorithm needs per-flow queues, these are created by
> + * calling gs_init_class() and destroyed with gs_fini_class(),
> + * and they are also inserted in the hash table implemented in
> + * the g_sched_softc{}.
> + *
> + * If an algorithm is replaced, or a transparently-inserted node is
> + * removed with "geom sched destroy", we need to remove all references
> + * to the g_*_softc{} and g_sched_softc from the bio's still in
> + * the scheduler.  g_sched_forced_dispatch() helps with this.
> + * XXX need to explain better.
> + */
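
To make the callback interface described above a bit more concrete, here
is a sketch of what a trivial FIFO "algorithm" could look like.  The exact
prototypes live in gs_scheduler.h (not shown in the truncated diff), so
the signatures of gs_fifo_init/fini/start below are assumptions inferred
from the calls visible in g_sched.c; only gs_next(data, force) is taken
directly from the dispatch loop.  Registration with the framework
(filling in a struct g_gsched plus the module glue) is omitted.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <geom/geom.h>

/* Per-geom private data for the sketched FIFO algorithm. */
struct gs_fifo_softc {
	struct bio_queue_head sc_queue;	/* single queue, no per-flow state */
};

static void *
gs_fifo_init(struct g_geom *gp __unused)	/* assumed prototype */
{
	struct gs_fifo_softc *sc;

	/* A real module would define its own malloc type. */
	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK | M_ZERO);
	bioq_init(&sc->sc_queue);
	return (sc);		/* becomes sc_data in g_sched_softc{} */
}

static void
gs_fifo_fini(void *data)			/* assumed prototype */
{

	free(data, M_DEVBUF);
}

static int
gs_fifo_start(void *data, struct bio *bp)	/* assumed prototype */
{
	struct gs_fifo_softc *sc = data;

	/* Called when a request arrives; just queue it. */
	bioq_insert_tail(&sc->sc_queue, bp);
	return (0);
}

static struct bio *
gs_fifo_next(void *data, int force __unused)
{
	struct gs_fifo_softc *sc = data;

	/* g_sched_dispatch() loops on this until it returns NULL. */
	return (bioq_takefirst(&sc->sc_queue));
}

gs_rr.c is structured along these lines, but adds per-client queues
(created via gs_init_class()/gs_fini_class()) and the anticipation logic.
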
> +
> +#include <sys/cdefs.h>
> +#include <sys/param.h>
> +#include <sys/systm.h>
> +#include <sys/kernel.h>
> +#include <sys/module.h>
> +#include <sys/lock.h>
> +#include <sys/mutex.h>
> +#include <sys/bio.h>
> +#include <sys/limits.h>
> +#include <sys/hash.h>
> +#include <sys/sysctl.h>
> +#include <sys/malloc.h>
> +#include <sys/proc.h>		/* we access curthread */
> +#include <geom/geom.h>
> +#include "gs_scheduler.h"
> +#include "g_sched.h"		/* geom hooks */
> +
> +/*
> + * Size of the per-geom hash table storing traffic classes.
> + * We may decide to change it at a later time, it has no ABI
> + * implications as it is only used for run-time allocations.
> + */
> +#define G_SCHED_HASH_SIZE	32
> +
> +static int g_sched_destroy(struct g_geom *gp, boolean_t force);
> +static int g_sched_destroy_geom(struct gctl_req *req,
> +    struct g_class *mp, struct g_geom *gp);
> +static void g_sched_config(struct gctl_req *req, struct g_class *mp,
> +    const char *verb);
> +static struct g_geom *g_sched_taste(struct g_class *mp,
> +    struct g_provider *pp, int flags __unused);
> +static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
> +    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
> +static void g_sched_init(struct g_class *mp);
> +static void g_sched_fini(struct g_class *mp);
> +
> +struct g_class g_sched_class = {
> +	.name = G_SCHED_CLASS_NAME,
> +	.version = G_VERSION,
> +	.ctlreq = g_sched_config,
> +	.taste = g_sched_taste,
> +	.destroy_geom = g_sched_destroy_geom,
> +	.init = g_sched_init,
> +	.fini = g_sched_fini
> +};
> +
> +MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
> +
> +/*
> + * Global variables describing the state of the geom_sched module.
> + * There is only one static instance of this structure.
> + */
> +LIST_HEAD(gs_list, g_gsched);	/* type, link field */
> +struct geom_sched_vars {
> +	struct mtx	gs_mtx;
> +	struct gs_list	gs_scheds;	/* list of algorithms */
> +	u_int		gs_debug;
> +	u_int		gs_sched_count;	/* how many algorithms ? */
> +	u_int 		gs_patched;	/* g_io_request was patched */
> +
> +	u_int		gs_initialized;
> +	u_int		gs_expire_secs;	/* expiration of hash entries */
> +
> +	struct bio_queue_head gs_pending;
> +	u_int		gs_npending;
> +
> +	/* The following are for stats, usually protected by gs_mtx. */
> +	u_long		gs_requests;	/* total requests */
> +	u_long		gs_done;	/* total done */
> +	u_int 		gs_in_flight;	/* requests in flight */
> +	u_int 		gs_writes_in_flight;
> +	u_int 		gs_bytes_in_flight;
> +	u_int 		gs_write_bytes_in_flight;
> +
> +	char		gs_names[256];	/* names of schedulers */
> +};
> +
> +static struct geom_sched_vars me = {
> +	.gs_expire_secs = 10,
> +};
> +
> +SYSCTL_DECL(_kern_geom);
> +SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
> +    "GEOM_SCHED stuff");
> +
> +SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
> +    &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
> +
> +SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
> +    &me.gs_bytes_in_flight, 0, "Bytes in flight");
> +
> +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
> +    &me.gs_writes_in_flight, 0, "Write Requests in flight");
> +
> +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
> +    &me.gs_in_flight, 0, "Requests in flight");
> +
> +SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
> +    &me.gs_done, 0, "Total done");
> +
> +SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
> +    &me.gs_requests, 0, "Total requests");
> +
> +SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
> +    &me.gs_names, 0, "Algorithm names");
> +
> +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
> +    &me.gs_sched_count, 0, "Number of algorithms");
> +
> +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
> +    &me.gs_debug, 0, "Debug level");
> +
> +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
> +    &me.gs_expire_secs, 0, "Expire time in seconds");
> +
> +/*
> + * g_sched calls the scheduler algorithms with this lock held.
> + * The locking functions are exposed so the scheduler algorithms can also
> + * protect themselves e.g. when running a callout handler.
> + */
> +void
> +g_sched_lock(struct g_geom *gp)
> +{
> +	struct g_sched_softc *sc = gp->softc;
> +
> +	mtx_lock(&sc->sc_mtx);
> +}
> +
> +void
> +g_sched_unlock(struct g_geom *gp)
> +{
> +	struct g_sched_softc *sc = gp->softc;
> +
> +	mtx_unlock(&sc->sc_mtx);
> +}
> +
> +/*
> + * Support functions to handle references to the module,
> + * which are coming from devices using this scheduler.
> + */
> +static inline void
> +g_gsched_ref(struct g_gsched *gsp)
> +{
> +
> +	atomic_add_int(&gsp->gs_refs, 1);
> +}
> +
> +static inline void
> +g_gsched_unref(struct g_gsched *gsp)
> +{
> +
> +	atomic_add_int(&gsp->gs_refs, -1);
> +}
> +
> +/*
> + * Update the stats when this request is done.
> + */
> +static void
> +g_sched_update_stats(struct bio *bio)
> +{
> +
> +	me.gs_done++;
> +	me.gs_in_flight--;
> +	me.gs_bytes_in_flight -= bio->bio_length;
> +	if (bio->bio_cmd & BIO_WRITE) {
> +		me.gs_writes_in_flight--;
> +		me.gs_write_bytes_in_flight -= bio->bio_length;
> +	}
> +}
> +
> +/*
> + * Dispatch any pending request.
> + */
> +static void
> +g_sched_forced_dispatch(struct g_geom *gp)
> +{
> +	struct g_sched_softc *sc = gp->softc;
> +	struct g_gsched *gsp = sc->sc_gsched;
> +	struct bio *bp;
> +
> +	KASSERT(mtx_owned(&sc->sc_mtx),
> +	    ("sc_mtx not owned during forced dispatch"));
> +
> +	while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
> +		g_io_request(bp, LIST_FIRST(&gp->consumer));
> +}
> +
> +/*
> + * The main dispatch loop, called either here after the start
> + * routine, or by scheduling algorithms when they receive a timeout
> + * or a 'done' notification.  Does not share code with the forced
> + * dispatch path, since the gs_done() callback can call us.
> + */
> +void
> +g_sched_dispatch(struct g_geom *gp)
> +{
> +	struct g_sched_softc *sc = gp->softc;
> +	struct g_gsched *gsp = sc->sc_gsched;
> +	struct bio *bp;
> +
> +	KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));
> +
> +	if ((sc->sc_flags & G_SCHED_FLUSHING))
> +		return;
> +
> +	while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
> +		g_io_request(bp, LIST_FIRST(&gp->consumer));
> +}
> +
> +/*
> + * Recent (8.0 and above) versions of FreeBSD have support to
> + * register classifiers of disk requests. The classifier is
> + * invoked by g_io_request(), and stores the information into
> + * bp->bio_classifier1.
> + *
> + * Support for older versions, which is left here only for
> + * documentation purposes, relies on two hacks:
> + * 1. classification info is written into the bio_caller1
> + *    field of the topmost node in the bio chain. This field
> + *    is rarely used, but this module is incompatible with
> + *    those that use bio_caller1 for other purposes,
> + *    such as ZFS and gjournal;
> + * 2. g_io_request() is patched in-memory when the module is
> + *    loaded, so that the function calls a classifier as its
> + *    first thing. g_io_request() is restored when the module
> + *    is unloaded. This functionality is only supported for
> + *    x86 and amd64, other architectures need source code changes.
> + */
> +
> +/*
> + * Lookup the identity of the issuer of the original request.
> + * In the current implementation we use the curthread of the
> + * issuer, but different mechanisms may be implemented later
> + * so we do not make assumptions on the return value which for
> + * us is just an opaque identifier.
> + */
> +
> +static inline u_long
> +g_sched_classify(struct bio *bp)
> +{
> +
> +#if __FreeBSD_version > 800098
> +	/* we have classifier fields in the struct bio */
> +#define HAVE_BIO_CLASSIFIER
> +	return ((u_long)bp->bio_classifier1);
> +#else
> +#warning old version!!!
> +	while (bp->bio_parent != NULL)
> +		bp = bp->bio_parent;
> +
> +	return ((u_long)bp->bio_caller1);
> +#endif
> +}
> +
> +/* Return the hash chain for the given key. */
> +static inline struct g_hash *
> +g_sched_hash(struct g_sched_softc *sc, u_long key)
> +{
> +
> +	return (&sc->sc_hash[key & sc->sc_mask]);
> +}
> +
> +/*
> + * Helper function for the children classes, which takes
> + * a geom and a bio and returns the private descriptor
> + * associated to the request.  This involves fetching
> + * the classification field and [al]locating the
> + * corresponding entry in the hash table.
> + */
> +void *
> +g_sched_get_class(struct g_geom *gp, struct bio *bp)
> +{
> +	struct g_sched_softc *sc;
> +	struct g_sched_class *gsc;
> +	struct g_gsched *gsp;
> +	struct g_hash *bucket;
> +	u_long key;
> +
> +	sc = gp->softc;
> +	key = g_sched_classify(bp);
> +	bucket = g_sched_hash(sc, key);
> +	LIST_FOREACH(gsc, bucket, gsc_clist) {
> +		if (key == gsc->gsc_key) {
> +			gsc->gsc_refs++;
> +			return (gsc->gsc_priv);
> +		}
> +	}
> +
> +	gsp = sc->sc_gsched;
> +	gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
> +	    M_GEOM_SCHED, M_NOWAIT | M_ZERO);
> +	if (!gsc)
> +		return (NULL);
> +
> +	if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
> +		free(gsc, M_GEOM_SCHED);
> +		return (NULL);
> +	}
> +
> +	gsc->gsc_refs = 2;	/* 1 for the hash table, 1 for the caller. */
> +	gsc->gsc_key = key;
> +	LIST_INSERT_HEAD(bucket, gsc, gsc_clist);
> +
> +	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
> +
> +	return (gsc->gsc_priv);
> +}
> +
> +/*
> + * Release a reference to the per-client descriptor.
> + */
> +void
> +g_sched_put_class(struct g_geom *gp, void *priv)
> +{
> +	struct g_sched_class *gsc;
> +	struct g_sched_softc *sc;
> +
> +	gsc = g_sched_priv2class(priv);
> +	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
> +
> +	if (--gsc->gsc_refs > 0)
> +		return;
> +
> +	sc = gp->softc;
> +	sc->sc_gsched->gs_fini_class(sc->sc_data, priv);
> +
> +	LIST_REMOVE(gsc, gsc_clist);
> +	free(gsc, M_GEOM_SCHED);
> +}
> +
> +static void
> +g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
> +    struct g_gsched *gsp, void *data)
> +{
> +	struct g_sched_class *cp, *cp2;
> +	int i;
> +
> +	if (!hp)
> +		return;
> +
> +	if (data && gsp->gs_hash_unref)
> +		gsp->gs_hash_unref(data);
> +
> +	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
> +		LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
> +			g_sched_put_class(gp, cp->gsc_priv);
> +	}
> +
> +	hashdestroy(hp, M_GEOM_SCHED, mask);
> +}
> +
> +static struct g_hash *
> +g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
> +{
> +	struct g_hash *hash;
> +
> +	if (gsp->gs_priv_size == 0)
> +		return (NULL);
> +
> +	hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);
> +
> +	return (hash);
> +}
> +
> +static void
> +g_sched_flush_classes(struct g_geom *gp)
> +{
> +	struct g_sched_softc *sc;
> +	struct g_sched_class *cp, *cp2;
> +	int i;
> +
> +	sc = gp->softc;
> +
> +	if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
> +		return;
> +
> +	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
> +		LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
> +			if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
> +				g_sched_put_class(gp, cp->gsc_priv);
> +		}
> +	}
> +
> +	sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
> +}
> +
> +/*
> + * Wait for the completion of any outstanding request.  To ensure
> + * that this does not take forever, the caller has to make sure that
> + * no new requests enter the scheduler before calling us.
> + *
> + * Must be called with the gp mutex held and topology locked.
> + */
> +static int
> +g_sched_wait_pending(struct g_geom *gp)
> +{
> +	struct g_sched_softc *sc = gp->softc;
> +	int endticks = ticks + hz;
> +
> +	g_topology_assert();
> +
> +	while (sc->sc_pending && endticks - ticks >= 0)
> +		msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
> +
> +	return (sc->sc_pending ? ETIMEDOUT : 0);
> +}
> +
> +static int
> 
> *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***

