svn commit: r237328 - in head/sys/cam: . scsi

Kenneth D. Merry ken at FreeBSD.org
Wed Jun 20 17:08:01 UTC 2012


Author: ken
Date: Wed Jun 20 17:08:00 2012
New Revision: 237328
URL: http://svn.freebsd.org/changeset/base/237328

Log:
  Fix several reference counting and object lifetime issues between
  the pass(4) and enc(4) drivers and devfs.
  
  The pass(4) driver uses the destroy_dev_sched() routine to
  schedule its device node for destruction in a separate thread
  context.  It does this because the passcleanup() routine can get
  called indirectly from the passclose() routine, and that would
  cause a deadlock if the close routine tried to destroy its own
  device node.
  
  In any case, once a particular passthrough driver number, e.g.
  pass3, is destroyed, CAM considers that unit number (3 in this
  case) available for reuse.
  
  The problem is that devfs may not be done cleaning up the previous
  instance of pass3, and will panic if isn't done cleaning up the
  previous instance.
  
  The solution is to get a callback from devfs when the device node
  is removed, and make sure we hold a reference to the peripheral
  until that happens.
  
  Testing exposed some other cases where we have reference counting
  issues, and those were also fixed in the pass(4) driver.
  
  cam_periph.c:	In camperiphfree(), reorder some of the operations.
  
  		The peripheral destructor needs to be called before
  		the peripheral is removed from the peripheral is
  		removed from the list.  This is because once we
  		remove the peripheral from the list, and drop the
  		topology lock, the peripheral number may be reused.
  		But if the destructor hasn't been called yet, there
  		may still be resources hanging around (like devfs
  		nodes) that haven't been fully cleaned up.
  
  cam_xpt.c:	Add an argument to xpt_remove_periph() to indicate
  		whether the topology lock is already held.
  
  scsi_enc.c:	Acquire an extra reference to the peripheral during
  		registration, and release it once we get a callback
  		from devfs indicating that the device node is gone.
  
  		Call destroy_dev_sched_cb() in enc_oninvalidate()
  		instead of calling destroy_dev() in the cleanup
  		routine.
  
  scsi_pass.c:	Add reference counting to handle peripheral and
  		devfs object lifetime issues.
  
  		Add a reference to the peripheral and the devfs
  		node in the peripheral registration.
  
  		Don't attempt to add a physical path alias if the
  		peripheral has been marked invalid.
  
  		Release the devfs reference once the initial
  		physical path alias taskqueue run has completed.
  
  		Schedule devfs node destruction in the
  		passoninvalidate(), and release our peripheral
  		reference in a new routine, passdevgonecb() once
  		the devfs node is gone.  This allows the peripheral
  		to fully go away, and the peripheral destructor,
  		passcleanup(), will get called.
  
  MFC after:	3 days
  Sponsored by:	Spectra Logic

Modified:
  head/sys/cam/cam_periph.c
  head/sys/cam/cam_xpt.c
  head/sys/cam/cam_xpt_periph.h
  head/sys/cam/scsi/scsi_enc.c
  head/sys/cam/scsi/scsi_pass.c

Modified: head/sys/cam/cam_periph.c
==============================================================================
--- head/sys/cam/cam_periph.c	Wed Jun 20 16:51:14 2012	(r237327)
+++ head/sys/cam/cam_periph.c	Wed Jun 20 17:08:00 2012	(r237328)
@@ -258,7 +258,7 @@ failure:
 		break;
 	case 3:
 		CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n"));
-		xpt_remove_periph(periph);
+		xpt_remove_periph(periph, /*topology_lock_held*/ 0);
 		/* FALLTHROUGH */
 	case 2:
 		xpt_lock_buses();
@@ -610,13 +610,38 @@ camperiphfree(struct cam_periph *periph)
 		return;
 	}
 
-	TAILQ_REMOVE(&(*p_drv)->units, periph, unit_links);
-	(*p_drv)->generation++;
+	/*
+	 * The peripheral destructor semantics dictate calling with only the
+	 * SIM mutex held.  Since it might sleep, it should not be called
+	 * with the topology lock held.
+	 */
 	xpt_unlock_buses();
 
+	/*
+	 * We need to call the peripheral destructor prior to removing the
+	 * peripheral from the list.  Otherwise, we risk running into a
+	 * scenario where the peripheral unit number may get reused
+	 * (because it has been removed from the list), but some resources
+	 * used by the peripheral are still hanging around.  In particular,
+	 * the devfs nodes used by some peripherals like the pass(4) driver
+	 * aren't fully cleaned up until the destructor is run.  If the
+	 * unit number is reused before the devfs instance is fully gone,
+	 * devfs will panic.
+	 */
 	if (periph->periph_dtor != NULL)
 		periph->periph_dtor(periph);
-	xpt_remove_periph(periph);
+
+	/*
+	 * The peripheral list is protected by the topology lock.
+	 */
+	xpt_lock_buses();
+
+	TAILQ_REMOVE(&(*p_drv)->units, periph, unit_links);
+	(*p_drv)->generation++;
+
+	xpt_remove_periph(periph, /*topology_lock_held*/ 1);
+
+	xpt_unlock_buses();
 	CAM_DEBUG(periph->path, CAM_DEBUG_INFO, ("Periph destroyed\n"));
 
 	if (periph->flags & CAM_PERIPH_NEW_DEV_FOUND) {

Modified: head/sys/cam/cam_xpt.c
==============================================================================
--- head/sys/cam/cam_xpt.c	Wed Jun 20 16:51:14 2012	(r237327)
+++ head/sys/cam/cam_xpt.c	Wed Jun 20 17:08:00 2012	(r237328)
@@ -1026,7 +1026,7 @@ xpt_add_periph(struct cam_periph *periph
 }
 
 void
-xpt_remove_periph(struct cam_periph *periph)
+xpt_remove_periph(struct cam_periph *periph, int topology_lock_held)
 {
 	struct cam_ed *device;
 
@@ -1047,9 +1047,13 @@ xpt_remove_periph(struct cam_periph *per
 		SLIST_REMOVE(periph_head, periph, cam_periph, periph_links);
 	}
 
-	mtx_lock(&xsoftc.xpt_topo_lock);
+	if (topology_lock_held == 0)
+		mtx_lock(&xsoftc.xpt_topo_lock);
+
 	xsoftc.xpt_generation++;
-	mtx_unlock(&xsoftc.xpt_topo_lock);
+
+	if (topology_lock_held == 0)
+		mtx_unlock(&xsoftc.xpt_topo_lock);
 }
 
 

Modified: head/sys/cam/cam_xpt_periph.h
==============================================================================
--- head/sys/cam/cam_xpt_periph.h	Wed Jun 20 16:51:14 2012	(r237327)
+++ head/sys/cam/cam_xpt_periph.h	Wed Jun 20 17:08:00 2012	(r237328)
@@ -42,7 +42,8 @@ void		xpt_polled_action(union ccb *ccb);
 void		xpt_release_ccb(union ccb *released_ccb);
 void		xpt_schedule(struct cam_periph *perph, u_int32_t new_priority);
 int32_t		xpt_add_periph(struct cam_periph *periph);
-void		xpt_remove_periph(struct cam_periph *periph);
+void		xpt_remove_periph(struct cam_periph *periph,
+				  int topology_lock_held);
 void		xpt_announce_periph(struct cam_periph *periph,
 				    char *announce_string);
 #endif

Modified: head/sys/cam/scsi/scsi_enc.c
==============================================================================
--- head/sys/cam/scsi/scsi_enc.c	Wed Jun 20 16:51:14 2012	(r237327)
+++ head/sys/cam/scsi/scsi_enc.c	Wed Jun 20 17:08:00 2012	(r237328)
@@ -114,6 +114,16 @@ enc_init(void)
 }
 
 static void
+enc_devgonecb(void *arg)
+{
+	struct cam_periph *periph;
+
+	periph = (struct cam_periph *)arg;
+
+	cam_periph_release(periph);
+}
+
+static void
 enc_oninvalidate(struct cam_periph *periph)
 {
 	struct enc_softc *enc;
@@ -141,6 +151,8 @@ enc_oninvalidate(struct cam_periph *peri
 	}
 	callout_drain(&enc->status_updater);
 
+	destroy_dev_sched_cb(enc->enc_dev, enc_devgonecb, periph);
+
 	xpt_print(periph->path, "lost device\n");
 }
 
@@ -152,9 +164,7 @@ enc_dtor(struct cam_periph *periph)
 	enc = periph->softc;
 
 	xpt_print(periph->path, "removing device entry\n");
-	cam_periph_unlock(periph);
-	destroy_dev(enc->enc_dev);
-	cam_periph_lock(periph);
+
 
 	/* If the sub-driver has a cleanup routine, call it */
 	if (enc->enc_vec.softc_cleanup != NULL)
@@ -951,9 +961,19 @@ enc_ctor(struct cam_periph *periph, void
 			goto out;
 		}
 	}
+
+	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
+		xpt_print(periph->path, "%s: lost periph during "
+			  "registration!\n", __func__);
+		cam_periph_lock(periph);
+
+		return (CAM_REQ_CMP_ERR);
+	}
+
 	enc->enc_dev = make_dev(&enc_cdevsw, periph->unit_number,
 	    UID_ROOT, GID_OPERATOR, 0600, "%s%d",
 	    periph->periph_name, periph->unit_number);
+
 	cam_periph_lock(periph);
 	enc->enc_dev->si_drv1 = periph;
 

Modified: head/sys/cam/scsi/scsi_pass.c
==============================================================================
--- head/sys/cam/scsi/scsi_pass.c	Wed Jun 20 16:51:14 2012	(r237327)
+++ head/sys/cam/scsi/scsi_pass.c	Wed Jun 20 17:08:00 2012	(r237328)
@@ -55,7 +55,8 @@ __FBSDID("$FreeBSD$");
 typedef enum {
 	PASS_FLAG_OPEN			= 0x01,
 	PASS_FLAG_LOCKED		= 0x02,
-	PASS_FLAG_INVALID		= 0x04
+	PASS_FLAG_INVALID		= 0x04,
+	PASS_FLAG_INITIAL_PHYSPATH	= 0x08
 } pass_flags;
 
 typedef enum {
@@ -133,7 +134,18 @@ passinit(void)
 		printf("pass: Failed to attach master async callback "
 		       "due to status 0x%x!\n", status);
 	}
-	
+
+}
+
+static void
+passdevgonecb(void *arg)
+{
+	struct cam_periph *periph;
+
+	periph = (struct cam_periph *)arg;
+
+	xpt_print(periph->path, "%s: devfs entry is gone\n", __func__);
+	cam_periph_release(periph);
 }
 
 static void
@@ -151,6 +163,12 @@ passoninvalidate(struct cam_periph *peri
 	softc->flags |= PASS_FLAG_INVALID;
 
 	/*
+	 * Tell devfs this device has gone away, and ask for a callback
+	 * when it has cleaned up its state.
+	 */
+	destroy_dev_sched_cb(softc->dev, passdevgonecb, periph);
+
+	/*
 	 * XXX Return all queued I/O with ENXIO.
 	 * XXX Handle any transactions queued to the card
 	 *     with XPT_ABORT_CCB.
@@ -176,11 +194,6 @@ passcleanup(struct cam_periph *periph)
 	cam_periph_unlock(periph);
 	taskqueue_drain(taskqueue_thread, &softc->add_physpath_task);
 
-	/*
-	 * passcleanup() is indirectly a d_close method via passclose,
-	 * so using destroy_dev(9) directly can result in deadlock.
-	 */
-	destroy_dev_sched(softc->dev);
 	cam_periph_lock(periph);
 
 	free(softc, M_DEVBUF);
@@ -199,6 +212,12 @@ pass_add_physpath(void *context, int pen
 	 */
 	periph = context;
 	softc = periph->softc;
+	cam_periph_lock(periph);
+	if (periph->flags & CAM_PERIPH_INVALID) {
+		cam_periph_unlock(periph);
+		return;
+	}
+	cam_periph_unlock(periph);
 	physpath = malloc(MAXPATHLEN, M_DEVBUF, M_WAITOK);
 	if (xpt_getattr(physpath, MAXPATHLEN,
 			"GEOM::physpath", periph->path) == 0
@@ -208,6 +227,19 @@ pass_add_physpath(void *context, int pen
 					softc->dev, softc->alias_dev, physpath);
 	}
 	free(physpath, M_DEVBUF);
+
+	/*
+	 * Now that we've made our alias, we no longer have to have a
+	 * reference to the device.
+	 */
+	cam_periph_lock(periph);
+	if ((softc->flags & PASS_FLAG_INITIAL_PHYSPATH) == 0) {
+		softc->flags |= PASS_FLAG_INITIAL_PHYSPATH;
+		cam_periph_unlock(periph);
+		dev_rel(softc->dev);
+	}
+	else
+		cam_periph_unlock(periph);
 }
 
 static void
@@ -281,12 +313,12 @@ passregister(struct cam_periph *periph, 
 
 	cgd = (struct ccb_getdev *)arg;
 	if (periph == NULL) {
-		printf("passregister: periph was NULL!!\n");
+		printf("%s: periph was NULL!!\n", __func__);
 		return(CAM_REQ_CMP_ERR);
 	}
 
 	if (cgd == NULL) {
-		printf("passregister: no getdev CCB, can't register device\n");
+		printf("%s: no getdev CCB, can't register device\n", __func__);
 		return(CAM_REQ_CMP_ERR);
 	}
 
@@ -294,8 +326,8 @@ passregister(struct cam_periph *periph, 
 					    M_DEVBUF, M_NOWAIT);
 
 	if (softc == NULL) {
-		printf("passregister: Unable to probe new device. "
-		       "Unable to allocate softc\n");				
+		printf("%s: Unable to probe new device. "
+		       "Unable to allocate softc\n", __func__);
 		return(CAM_REQ_CMP_ERR);
 	}
 
@@ -331,10 +363,31 @@ passregister(struct cam_periph *periph, 
 			  DEVSTAT_TYPE_PASS,
 			  DEVSTAT_PRIORITY_PASS);
 
+	/*
+	 * Acquire a reference to the periph before we create the devfs
+	 * instance for it.  We'll release this reference once the devfs
+	 * instance has been freed.
+	 */
+	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
+		xpt_print(periph->path, "%s: lost periph during "
+			  "registration!\n", __func__);
+		mtx_lock(periph->sim->mtx);
+		return (CAM_REQ_CMP_ERR);
+	}
+
 	/* Register the device */
 	softc->dev = make_dev(&pass_cdevsw, periph->unit_number,
 			      UID_ROOT, GID_OPERATOR, 0600, "%s%d",
 			      periph->periph_name, periph->unit_number);
+
+	/*
+	 * Now that we have made the devfs instance, hold a reference to it
+	 * until the task queue has run to setup the physical path alias.
+	 * That way devfs won't get rid of the device before we add our
+	 * alias.
+	 */
+	dev_ref(softc->dev);
+
 	mtx_lock(periph->sim->mtx);
 	softc->dev->si_drv1 = periph;
 


More information about the svn-src-all mailing list