kern/169403: CAM layer, I/O starvation, no fairness

jerry Toung jtoung at opnet.com
Mon Jun 25 17:50:07 UTC 2012


>Number:         169403
>Category:       kern
>Synopsis:       CAM layer, I/O starvation, no fairness
>Confidential:   no
>Severity:       serious
>Priority:       low
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Mon Jun 25 17:50:06 UTC 2012
>Closed-Date:
>Last-Modified:
>Originator:     jerry Toung
>Release:        FreeBSD 8.1-RELEASE #0
>Organization:
Opnet
>Environment:
FreeBSD dev8 8.1-RELEASE FreeBSD 8.1-RELEASE #0: Mon Jul 19 02:36:49 UTC 2010     root at mason.cse.buffalo.edu:/usr/obj/usr/src/sys/GENERIC  amd64

>Description:
I am convinced that there is a bug in the CAM code that leads to I/O starvation.
I have already discussed this privately with some. I am now bringing this up to
the general audience to get more feedback.

My setup is that I have 1 RAID controller with 2 arrays connected to
it, da0 and da1.
The controller supports 252 tags. After boot up, camcontrol tags on
da0 and da1 shows that both devices have 252 openings each. A process
P0 writing on da0 is dormant most of the time, but would wake up with
burst of I/Os, 5000-6000 ops as reported by gstat.
A process P1 writing on da1 has a fixed data rate to da1 as reported by gstat.

The issue: When P0 generates that burst of 5000-6000 ops, the write
rate of P1 on da1 goes to 0 MB/sec for up to 8-9sec,
vfs.hirunningspace starts climbing and we get into waithirunning() or
getblk() sleep channel. BTW, raising hirunningspace has no effect on
the 0 MB/s behavior.

The first problem that I see here, is that if the sim's devq has 252
alloc_queue and send_queue, the struct cam_ed representing da0 and da1 should each have 126 openings and not 252. 
The second problem is that clearly, there is no I/O fairness in CAM as seen
in gstat output and da0 exclusively takes a hold of the sim/controller
until it has processed all it's I/Os (8-9 seconds). The code that does
this is at

cam/cam_xpt.c:3030
3030             && (devq->alloc_openings > 0)

and

cam/cam_xpt.c:3091
3091             && (devq->send_openings > 0)

After you've split the openings to 126 each, the tests above will always be true

I have a patch and it fixes those problems. 

da0 and da1 now both automatically get 126 openings and based on that,
extra logic implements fairness in cam/cam_xpt.c. No more 0 MB/s on
da1. This is on 8.1-RELEASE FreeBSD.
>How-To-Repeat:

>Fix:


Patch attached with submission follows:

diff -rup cam/cam_queue.c cam/cam_queue.c
--- cam/cam_queue.c	2010-06-13 19:09:06.000000000 -0700
+++ cam/cam_queue.c	2012-03-22 10:50:50.000000000 -0700
@@ -412,7 +412,7 @@ heap_down(cam_pinfo **queue_array, int i
 {
 	int child;
 	int parent;
-	
+
 	parent = index;
 	child = parent << 1;
 	for (; child <= num_entries; child = parent << 1) {
diff -rup cam/cam_sim.c cam/cam_sim.c
--- cam/cam_sim.c	2010-06-13 19:09:06.000000000 -0700
+++ cam/cam_sim.c	2012-03-19 13:05:10.000000000 -0700
@@ -87,6 +87,7 @@ cam_sim_alloc(sim_action_func sim_action
 	sim->refcount = 1;
 	sim->devq = queue;
 	sim->max_ccbs = 8;	/* Reserve for management purposes. */
+	sim->dev_count = 0;	
 	sim->mtx = mtx;
 	if (mtx == &Giant) {
 		sim->flags |= 0;
diff -rup cam/cam_sim.h cam/cam_sim.h
--- cam/cam_sim.h	2010-06-13 19:09:06.000000000 -0700
+++ cam/cam_sim.h	2012-03-19 15:34:17.000000000 -0700
@@ -118,6 +118,8 @@ struct cam_sim {
 	u_int			max_ccbs;
 	/* Current count of allocated ccbs */
 	u_int			ccb_count;
+	/* Number of peripheral drivers mapped to this sim */
+	u_int			dev_count;	
 
 };
 
diff -rup cam/cam_xpt.c cam/cam_xpt.c
--- cam/cam_xpt.c	2010-06-13 19:09:06.000000000 -0700
+++ cam/cam_xpt.c	2012-03-29 11:41:51.000000000 -0700
@@ -303,7 +303,7 @@ xpt_schedule_dev_allocq(struct cam_eb *b
 	int retval;
 
 	if ((dev->drvq.entries > 0) &&
-	    (dev->ccbq.devq_openings > 0) &&
+	    (dev->runs_token < dev->ccbq.queue.array_size) &&
 	    (cam_ccbq_frozen(&dev->ccbq, CAM_PRIORITY_TO_RL(
 		CAMQ_GET_PRIO(&dev->drvq))) == 0)) {
 		/*
@@ -327,7 +327,7 @@ xpt_schedule_dev_sendq(struct cam_eb *bu
 	int	retval;
 
 	if ((dev->ccbq.queue.entries > 0) &&
-	    (dev->ccbq.dev_openings > 0) &&
+	    (dev->runs_token < dev->ccbq.queue.array_size) &&
 	    (cam_ccbq_frozen_top(&dev->ccbq) == 0)) {
 		/*
 		 * The priority of a device waiting for controller
@@ -973,6 +973,9 @@ xpt_add_periph(struct cam_periph *periph
 	struct cam_ed *device;
 	int32_t	 status;
 	struct periph_list *periph_head;
+	struct cam_eb *bus;
+	struct cam_et *target;
+	struct cam_ed *devptr;
 
 	mtx_assert(periph->sim->mtx, MA_OWNED);
 
@@ -991,6 +994,8 @@ xpt_add_periph(struct cam_periph *periph
 		status = camq_resize(&device->drvq,
 				     device->drvq.array_size + 1);
 
+		if (periph->periph_name != NULL &&  strncmp(periph->periph_name, "da",2) ==0 )
+			device->sim->dev_count++;
 		device->generation++;
 
 		SLIST_INSERT_HEAD(periph_head, periph, periph_links);
@@ -998,6 +1003,24 @@ xpt_add_periph(struct cam_periph *periph
 
 	mtx_lock(&xsoftc.xpt_topo_lock);
 	xsoftc.xpt_generation++;
+
+	if (device != NULL && device->sim->dev_count > 1 &&
+            (device->sim->max_dev_openings > device->sim->dev_count)) {
+		TAILQ_FOREACH(bus, &xsoftc.xpt_busses, links) {
+			if (bus->sim != device->sim)
+				continue;
+			TAILQ_FOREACH(target, &bus->et_entries, links) {
+				TAILQ_FOREACH(devptr, &target->ed_entries, links) {
+				/*
+		 		 * The number of openings/tags supported by the sim (i.e controller)
+		 		 * is evenly distributed between all devices that share this sim.
+		 		 */ 
+					cam_ccbq_resize(&devptr->ccbq, 
+			                                (devptr->sim->max_dev_openings/devptr->sim->dev_count));
+                                }
+                        }
+                }
+        }
 	mtx_unlock(&xsoftc.xpt_topo_lock);
 
 	return (status);
@@ -3072,6 +3095,11 @@ xpt_run_dev_allocq(struct cam_eb *bus)
 		}
 
 		/* We may have more work. Attempt to reschedule. */
+		device->runs_token++;
+		if (device->runs_token >= device->ccbq.queue.array_size) {
+			device->runs_token = 0;
+			break;
+		}
 		xpt_schedule_dev_allocq(bus, device);
 	}
 	devq->alloc_queue.qfrozen_cnt[0]--;
@@ -3139,7 +3167,6 @@ xpt_run_dev_sendq(struct cam_eb *bus)
 		devq->send_openings--;
 		devq->send_active++;
 
-		xpt_schedule_dev_sendq(bus, device);
 
 		if (work_ccb && (work_ccb->ccb_h.flags & CAM_DEV_QFREEZE) != 0){
 			/*
@@ -3170,6 +3197,13 @@ xpt_run_dev_sendq(struct cam_eb *bus)
 		 */
 		sim = work_ccb->ccb_h.path->bus->sim;
 		(*(sim->sim_action))(sim, work_ccb);
+
+		device->runs_token++;
+		if (device->runs_token >= device->ccbq.queue.array_size) {
+			device->runs_token = 0;
+			break;
+		}
+		xpt_schedule_dev_sendq(bus, device);
 	}
 	devq->send_queue.qfrozen_cnt[0]--;
 }
@@ -4285,6 +4319,7 @@ xpt_alloc_device(struct cam_eb *bus, str
 		device->tag_delay_count = 0;
 		device->tag_saved_openings = 0;
 		device->refcount = 1;
+		device->runs_token = 0;
 		callout_init_mtx(&device->callout, bus->sim->mtx, 0);
 
 		/*
diff -rup cam/cam_xpt_internal.h cam/cam_xpt_internal.h
--- cam/cam_xpt_internal.h	2010-06-13 19:09:06.000000000 -0700
+++ cam/cam_xpt_internal.h	2012-03-21 13:57:45.000000000 -0700
@@ -118,6 +118,7 @@ struct cam_ed {
 #define	CAM_TAG_DELAY_COUNT		5
 	u_int32_t	 tag_saved_openings;
 	u_int32_t	 refcount;
+	u_int32_t	 runs_token;
 	struct callout	 callout;
 };
 
diff -rup cam/scsi/scsi_da.c cam/scsi/scsi_da.c
--- cam/scsi/scsi_da.c	2010-06-13 19:09:06.000000000 -0700
+++ cam/scsi/scsi_da.c	2012-03-21 14:16:00.000000000 -0700
@@ -56,7 +56,13 @@ __FBSDID("$FreeBSD: src/sys/cam/scsi/scs
 #include <cam/cam_ccb.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_xpt_periph.h>
+#include <cam/cam_queue.h>
 #include <cam/cam_sim.h>
+#include <cam/cam_xpt.h>
+#include <cam/cam_xpt_sim.h>
+#include <cam/cam_xpt_periph.h>
+#include <cam/cam_xpt_internal.h>
+#include <cam/cam_debug.h>
 
 #include <cam/scsi/scsi_message.h>
 
@@ -1102,6 +1108,26 @@ dasysctlinit(void *context, int pending)
 		&softc->minimum_cmd_size, 0, dacmdsizesysctl, "I",
 		"Minimum CDB size");
 
+	SYSCTL_ADD_INT(&softc->sysctl_ctx,SYSCTL_CHILDREN(softc->sysctl_tree),
+		OID_AUTO, "outstanding_cmds", CTLTYPE_INT | CTLFLAG_RD,
+		&softc->outstanding_cmds, 0, "Outstanding CDB Cmds");
+
+	SYSCTL_ADD_INT(&softc->sysctl_ctx,SYSCTL_CHILDREN(softc->sysctl_tree),
+		OID_AUTO, "ccbq_devq_openings", CTLTYPE_INT | CTLFLAG_RD,
+		&periph->path->device->ccbq.devq_openings, 0, "CCBQ Dev Openings");
+
+	SYSCTL_ADD_INT(&softc->sysctl_ctx,SYSCTL_CHILDREN(softc->sysctl_tree),
+		OID_AUTO, "ccbq_array_size", CTLTYPE_INT | CTLFLAG_RW,
+		&periph->path->device->ccbq.queue.array_size, 0, "CCBQ Array Size");
+
+	SYSCTL_ADD_INT(&softc->sysctl_ctx,SYSCTL_CHILDREN(softc->sysctl_tree),
+		OID_AUTO, "sim_ccb_count", CTLTYPE_INT | CTLFLAG_RD,
+		&periph->sim->ccb_count, 0, "SIM CCB COUNT");
+
+	SYSCTL_ADD_INT(&softc->sysctl_ctx,SYSCTL_CHILDREN(softc->sysctl_tree),
+		OID_AUTO, "sim_devq_alloc_openings", CTLTYPE_INT | CTLFLAG_RD,
+		&periph->sim->devq->alloc_openings, 0, "SIM Devq Alloc Openings");
+
 	cam_periph_release(periph);
 }
 


>Release-Note:
>Audit-Trail:
>Unformatted:


More information about the freebsd-bugs mailing list