[Patch] camcontrol - determine the defect list length

Mon Oct 25 15:55:09 UTC 2010

Hi,

Below is a patch for camcontrol from the tree at my work. It was introduced several years ago to resolve some issues that we were seeing in Adaptec's RAID controller firmware. The change is to the readdefects() function - when sending the request to read the defect list, instead of using a 65000-entry buffer, we wrote a new function to query the drive and determine the size of the defect list before allocating any memory. Apparently, if we send a maximum length longer than the number of bytes in the defect list, the cam call returns an overflow error due to what is probably a firmware bug.

It's probable that this issue has been fixed in Adaptec's firmware for a while now, and that this change is no longer necessary to us. However, it's conceivable that other devices could have the same problem, and as far as I can see, this patch shouldn't introduce any regressions - it essentially consists of sending an additional CAM request before getting the defect list.

I'd like to know whether this patch is worth trying to get committed into FreeBSD. If anyone has some comments or suggestions on the patch, I'd highly appreciate it.

Thanks,
-Mark

--- camcontrol.c@@/main/RELENG_8/RELENG_8_1/svos_8_1/0	2010-10-21 10:15:24.000000000 -0400
+++ camcontrol.c@@/main/RELENG_8/RELENG_8_1/svos_8_1/1	2010-10-21 15:39:04.000000000 -0400
@@ -202,6 +202,8 @@ static int scanlun_or_reset_dev(int bus,
 #ifndef MINIMALISTIC
 static int readdefects(struct cam_device *device, int argc, char **argv,
 		       char *combinedopt, int retry_count, int timeout);
+static int readdefectlen(struct cam_device *device, int retry_count, 
+			 int timeout, cam_argmask args, int *ret_len);
 static void modepage(struct cam_device *device, int argc, char **argv,
 		     char *combinedopt, int retry_count, int timeout);
 static int scsicmd(struct cam_device *device, int argc, char **argv, 
@@ -1722,7 +1724,7 @@ readdefects(struct cam_device *device, i
 	union ccb *ccb = NULL;
 	struct scsi_read_defect_data_10 *rdd_cdb;
 	u_int8_t *defect_list = NULL;
-	u_int32_t dlist_length = 65000;
+	u_int32_t dlist_length = 0;
 	u_int32_t returned_length = 0;
 	u_int32_t num_returned = 0;
 	u_int8_t returned_format;
@@ -1764,12 +1766,23 @@ readdefects(struct cam_device *device, i
 
 	ccb = cam_getccb(device);
 
-	/*
-	 * Hopefully 65000 bytes is enough to hold the defect list.  If it
-	 * isn't, the disk is probably dead already.  We'd have to go with
-	 * 12 byte command (i.e. alloc_length is 32 bits instead of 16)
-	 * to hold them all.
-	 */
+	/* get the defect list length
+	 * We need to do this as a workaround for a bug in either the Adaptec RAID or Hitachi scsi
+	 * firmware. For some reason, if we give them a max length greater than the number of bytes
+	 * in the dlist + the header length, the cam call returns an overflow error.
+	 * It doesn't look like it's in the driver, as this error seems to be populated by the SRB
+	 * error, and the SRB is an interpreted byte array stored in the FIB, leading me to believe
+	 * that it is directly populated by the firmware.
+	 */
+	if((error = readdefectlen(device, retry_count, timeout, arglist, &dlist_length)))
+	{
+		warnx("unable to get defect list length");
+		goto defect_bailout;
+	}
+	
+	/* the +=4 is to hold the header */
+	dlist_length +=4;
+	
 	defect_list = malloc(dlist_length);
 	if (defect_list == NULL) {
 		warnx("can't malloc memory for defect list");
@@ -2004,6 +2017,170 @@ defect_bailout:
 
 	return(error);
 }
+
+/* returns the number of bytes in the appropriate defect list */
+static int readdefectlen(struct cam_device *device, int retry_count, 
+			 int timeout, cam_argmask args, int *ret_len)
+{
+	struct scsi_read_defect_data_10 *rdd_cdb;
+	u_int8_t *defect_list = NULL;
+	union ccb *ccb = NULL;
+
+	u_int8_t returned_format;
+	/*4 bytes will hold the defect header*/
+	u_int32_t dlist_length = 4;
+	u_int32_t returned_length = 0;
+	
+	int error = 0;
+	int lists_specified = 0;
+
+	ccb = cam_getccb(device);
+
+	defect_list = malloc(dlist_length);
+	if (defect_list == NULL) {
+		warnx("can't malloc memory for defect list");
+		error = 1;
+		goto defect_len_bailout;
+	}
+
+	rdd_cdb =(struct scsi_read_defect_data_10 *)&ccb->csio.cdb_io.cdb_bytes;
+
+	/*Essentially just delete stuff from here on*/
+	/*
+	 * cam_getccb() zeros the CCB header only.  So we need to zero the
+	 * payload portion of the ccb.
+	 */
+	bzero(&(&ccb->ccb_h)[1],
+	      sizeof(struct ccb_scsiio) - sizeof(struct ccb_hdr));
+
+	cam_fill_csio(&ccb->csio,
+		      /*retries*/ retry_count,
+		      /*cbfcnp*/ NULL,
+		      /*flags*/ CAM_DIR_IN | ((arglist & CAM_ARG_ERR_RECOVER) ?
+					      CAM_PASS_ERR_RECOVER : 0),
+		      /*tag_action*/ MSG_SIMPLE_Q_TAG,
+		      /*data_ptr*/ defect_list,
+		      /*dxfer_len*/ dlist_length,
+		      /*sense_len*/ SSD_FULL_SIZE,
+		      /*cdb_len*/ sizeof(struct scsi_read_defect_data_10),
+		      /*timeout*/ timeout ? timeout : 5000);
+
+	rdd_cdb->opcode = READ_DEFECT_DATA_10;
+	if (args & CAM_ARG_FORMAT_BLOCK)
+		rdd_cdb->format = SRDD10_BLOCK_FORMAT;
+	else if (args & CAM_ARG_FORMAT_BFI)
+		rdd_cdb->format = SRDD10_BYTES_FROM_INDEX_FORMAT;
+	else if (args & CAM_ARG_FORMAT_PHYS)
+		rdd_cdb->format = SRDD10_PHYSICAL_SECTOR_FORMAT;
+	else {
+		error = 1;
+		warnx("no defect list format specified");
+		goto defect_len_bailout;
+	}
+	if (args & CAM_ARG_PLIST) {
+		rdd_cdb->format |= SRDD10_PLIST;
+		lists_specified++;
+	}
+
+	if (args & CAM_ARG_GLIST) {
+		rdd_cdb->format |= SRDD10_GLIST;
+		lists_specified++;
+	}
+
+	scsi_ulto2b(dlist_length, rdd_cdb->alloc_length);
+
+	/* Disable freezing the device queue */
+	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
+
+	if (cam_send_ccb(device, ccb) < 0) {
+		perror("error reading defect list");
+
+		if (args & CAM_ARG_VERBOSE) {
+			cam_error_print(device, ccb, CAM_ESF_ALL,
+					CAM_EPF_ALL, stderr);
+		}
+
+		error = 1;
+		goto defect_len_bailout;
+	}
+
+	returned_length = scsi_2btoul(((struct
+		scsi_read_defect_data_hdr_10 *)defect_list)->length);
+
+	returned_format = ((struct scsi_read_defect_data_hdr_10 *)
+			defect_list)->format;
+
+	if (((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR)
+	 && (ccb->csio.scsi_status == SCSI_STATUS_CHECK_COND)
+	 && ((ccb->ccb_h.status & CAM_AUTOSNS_VALID) != 0)) {
+		struct scsi_sense_data *sense;
+		int error_code, sense_key, asc, ascq;
+
+		sense = &ccb->csio.sense_data;
+		scsi_extract_sense(sense, &error_code, &sense_key, &asc, &ascq);
+
+		/*
+		 * According to the SCSI spec, if the disk doesn't support
+		 * the requested format, it will generally return a sense
+		 * key of RECOVERED ERROR, and an additional sense code
+		 * of "DEFECT LIST NOT FOUND".  So, we check for that, and
+		 * also check to make sure that the returned length is
+		 * greater than 0, and then print out whatever format the
+		 * disk gave us.
+		 */
+		if ((sense_key == SSD_KEY_RECOVERED_ERROR)
+		 && (asc == 0x1c) && (ascq == 0x00)
+		 && (returned_length > 0)) {
+			warnx("requested defect format not available");
+			switch(returned_format & SRDDH10_DLIST_FORMAT_MASK) {
+			case SRDD10_BLOCK_FORMAT:
+				warnx("Device returned block format");
+				break;
+			case SRDD10_BYTES_FROM_INDEX_FORMAT:
+				warnx("Device returned bytes from index"
+				      " format");
+				break;
+			case SRDD10_PHYSICAL_SECTOR_FORMAT:
+				warnx("Device returned physical sector format");
+				break;
+			default:
+				error = 1;
+				warnx("Device returned unknown defect"
+				     " data format %#x", returned_format);
+				goto defect_len_bailout;
+				break; /* NOTREACHED */
+			}
+		} else {
+			error = 1;
+			warnx("Error returned from read defect data command");
+			if (args & CAM_ARG_VERBOSE)
+				cam_error_print(device, ccb, CAM_ESF_ALL,
+						CAM_EPF_ALL, stderr);
+			goto defect_len_bailout;
+		}
+	} else if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
+		error = 1;
+		warnx("Error returned from read defect data command");
+		if (args & CAM_ARG_VERBOSE)
+			cam_error_print(device, ccb, CAM_ESF_ALL,
+					CAM_EPF_ALL, stderr);
+		goto defect_len_bailout;
+	}
+
+	*ret_len = returned_length;
+	return 0;
+defect_len_bailout:
+
+	if (defect_list != NULL)
+		free(defect_list);
+
+	if (ccb != NULL)
+		cam_freeccb(ccb);
+
+	return(error);
+
+}
+
 #endif /* MINIMALISTIC */
 
 #if 0