kern/60313: data destruction: lseek() misalignment silently ignored by some block devices

Bruce Evans bde at zeta.org.au
Tue Dec 16 18:14:30 PST 2003


On Wed, 17 Dec 2003, Matthias Andree wrote:

> >Description:
> da(4) will happily return success on lseek() to some random position that is
> not aligned at a block boundary and start writing from the beginning, byte 0,
> of the block, ignoring the lseek().
>
> In other words, if I have a drive with 512 bytes per block, seek to offset 256
> and write 512 bytes, da(4) will scribble over the first 256 bytes as well.
> Either the seek or the write should be rejected.

The boundary is checked in dscheck() in RELENG_4, but only after
forgetting it mod DEV_BSIZE, so only offsets that are multiples of
DEV_BSIZE but not a multiple of the sector size are detected.

The boundary is forgotten here in physio() (RELENG_4 version):

% 			blockno = bp->b_offset >> DEV_BSHIFT;
% 			if (chk_blockno && (daddr_t)blockno != blockno) {
% 				error = EINVAL; /* blockno overflow */
% 				goto doerror;
% 			}
% 			bp->b_blkno = blockno;

except bp->b_offset is still available so lower layers can check it.  None
do.  (The acd driver doesn't use dscheck() and uses b_offset to determine
its own idea of the block number; it has a subset of dscheck()'s other
boundary checks and has the same nonexistent boundary checking as physio()
for the offset.)

> vn(4) will accept the seek but a subsequent write(2) will return the bogus
> EINVAL in errno.

I don't see how vn(4) can be missing the bug.  In the "labels" case it
uses dscheck() which doesn't check b_offset, and in the non-labels case
it uses its own check which also doesn't check b_offset.

> ad(4) status is unknown.

I don't see how it can be missing the bug.

> FreeBSD 5 status is unknown.

-current is quite different.  It deprecates b_blkno and uses b_offset
more (mostly in its bio_offset form), and uses GEOM (except in my
version).  physio() still sets b_blkno without checking the boundary,
but b_blkno is mostly not used.

The problem seems to be fixed in GEOM (function g_io_check()).

I fixed it my version of dscheck() but didn't get around
to committing the fix before dscheck() was axed.  Here are my current
diffs relative to RELENG_4.  They don't apply directly.  RELENG_4 needs
mainly the "if (offset % (uoff_t)DEV_BSIZE) {" clause.  The patch also
fixes some slightly incorrect choices of types that become fatally
incorrect with larger daddr_t's in -current, and fixes the longstanding
brokenness of EOF handling.

%%%
Index: subr_diskslice.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/Attic/subr_diskslice.c,v
retrieving revision 1.82.2.6
diff -u -2 -r1.82.2.6 subr_diskslice.c
--- subr_diskslice.c	24 Jul 2001 09:49:41 -0000	1.82.2.6
+++ subr_diskslice.c	7 Dec 2003 05:05:22 -0000
@@ -134,47 +284,55 @@
 int
 dscheck(bp, ssp)
-	struct buf *bp;
+	struct bio *bp;
 	struct diskslices *ssp;
 {
-	daddr_t	blkno;
-	u_long	endsecno;
-	daddr_t	labelsect;
+	daddr_t blkno;
+	u_long endsecno;
+	daddr_t labelsect;
 	struct disklabel *lp;
 	char *msg;
-	long	nsec;
+	long nsec;
+	off_t offset;
 	struct partition *pp;
-	daddr_t	secno;
-	daddr_t	slicerel_secno;
+	daddr_t secno;
+	daddr_t slicerel_secno;
 	struct diskslice *sp;
-	int s;

-	blkno = bp->b_blkno;
-	if (blkno < 0) {
-		printf("dscheck(%s): negative b_blkno %ld\n",
-		    devtoname(bp->b_dev), (long)blkno);
-		bp->b_error = EINVAL;
+	offset = bp->bio_offset;
+	if (offset < 0) {
+		printf("dscheck(%s): negative bio_offset %jd\n",
+		    devtoname(bp->bio_dev), (intmax_t)offset);
+		bp->bio_error = EINVAL;
+		goto bad;
+	}
+	if (offset % (uoff_t)DEV_BSIZE) {
+		printf(
+		"dscheck(%s): bio_offset %jd is not on a DEV_BSIZE boundary\n",
+		    devtoname(bp->bio_dev), (intmax_t)offset);
+		bp->bio_error = EINVAL;
 		goto bad;
 	}
-	sp = &ssp->dss_slices[dkslice(bp->b_dev)];
+	blkno = btodb(bp->bio_offset);
+	sp = &ssp->dss_slices[dkslice(bp->bio_dev)];
 	lp = sp->ds_label;
 	if (ssp->dss_secmult == 1) {
-		if (bp->b_bcount % (u_long)DEV_BSIZE)
+		if (bp->bio_bcount % (u_long)DEV_BSIZE)
 			goto bad_bcount;
 		secno = blkno;
-		nsec = bp->b_bcount >> DEV_BSHIFT;
+		nsec = bp->bio_bcount >> DEV_BSHIFT;
 	} else if (ssp->dss_secshift != -1) {
-		if (bp->b_bcount & (ssp->dss_secsize - 1))
+		if (bp->bio_bcount & (ssp->dss_secsize - 1))
 			goto bad_bcount;
 		if (blkno & (ssp->dss_secmult - 1))
-			goto bad_blkno;
+			goto bad_offset;
 		secno = blkno >> ssp->dss_secshift;
-		nsec = bp->b_bcount >> (DEV_BSHIFT + ssp->dss_secshift);
+		nsec = bp->bio_bcount >> (DEV_BSHIFT + ssp->dss_secshift);
 	} else {
-		if (bp->b_bcount % ssp->dss_secsize)
+		if (bp->bio_bcount % ssp->dss_secsize)
 			goto bad_bcount;
 		if (blkno % ssp->dss_secmult)
-			goto bad_blkno;
+			goto bad_offset;
 		secno = blkno / ssp->dss_secmult;
-		nsec = bp->b_bcount / ssp->dss_secsize;
+		nsec = bp->bio_bcount / ssp->dss_secsize;
 	}
 	if (lp == NULL) {
@@ -184,6 +342,6 @@
 	} else {
 		labelsect = lp->d_partitions[LABEL_PART].p_offset;
-if (labelsect != 0) Debugger("labelsect != 0 in dscheck()");
-		pp = &lp->d_partitions[dkpart(bp->b_dev)];
+		KASSERT(labelsect == 0, ("labelsect != 0 in dscheck()"));
+		pp = &lp->d_partitions[dkpart(bp->bio_dev)];
 		endsecno = pp->p_size;
 		slicerel_secno = pp->p_offset + secno;
@@ -196,6 +354,6 @@
 	    slicerel_secno + nsec > LABELSECTOR + labelsect &&
 #endif
-	    (bp->b_flags & B_READ) == 0 && sp->ds_wlabel == 0) {
-		bp->b_error = EROFS;
+	    bp->bio_cmd == BIO_WRITE && sp->ds_wlabel == 0) {
+		bp->bio_error = EROFS;
 		goto bad;
 	}
@@ -203,7 +361,7 @@
 #if defined(DOSBBSECTOR) && defined(notyet)
 	/* overwriting master boot record? */
-	if (slicerel_secno <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 &&
+	if (slicerel_secno <= DOSBBSECTOR && bp->bio_cmd == BIO_WRITE &&
 	    sp->ds_wlabel == 0) {
-		bp->b_error = EROFS;
+		bp->bio_error = EROFS;
 		goto bad;
 	}
@@ -211,20 +369,19 @@

 	/* beyond partition? */
-	if (secno + nsec > endsecno) {
-		/* if exactly at end of disk, return an EOF */
-		if (secno == endsecno) {
-			bp->b_resid = bp->b_bcount;
-			return (0);
-		}
-		/* or truncate if part of it fits */
-		nsec = endsecno - secno;
-		if (nsec <= 0) {
-			bp->b_error = EINVAL;
+	if ((uintmax_t)secno + nsec > endsecno) {
+		/* If at or beyond end of partition, return EOF. */
+		if (secno >= endsecno) {
+			if (bp->bio_cmd == BIO_READ) {
+				bp->bio_resid = bp->bio_bcount;
+				return (0);
+			}
+			bp->bio_error = ENOSPC;
 			goto bad;
 		}
-		bp->b_bcount = nsec * ssp->dss_secsize;
+		/* Truncate to the part that fits. */
+		bp->bio_bcount = (endsecno - secno) * ssp->dss_secsize;
 	}

-	bp->b_pblkno = sp->ds_offset + slicerel_secno;
+	bp->bio_pblkno = sp->ds_offset + slicerel_secno;

 	/*
@@ -237,18 +394,18 @@
 	    && slicerel_secno + nsec > LABELSECTOR + labelsect
 #endif
+	    /* XXX following seems to be broken for dedicated case: */
 	    && sp->ds_offset != 0) {
 		struct iodone_chain *ic;

 		ic = malloc(sizeof *ic , M_DEVBUF, M_WAITOK);
-		ic->ic_prev_flags = bp->b_flags;
-		ic->ic_prev_iodone = bp->b_iodone;
-		ic->ic_prev_iodone_chain = bp->b_iodone_chain;
+		ic->ic_prev_flags = bp->bio_flags;
+		ic->ic_prev_iodone = bp->bio_done;
+		ic->ic_prev_iodone_chain = bp->bio_done_chain;
 		ic->ic_args[0].ia_long = (LABELSECTOR + labelsect -
 		    slicerel_secno) * ssp->dss_secsize;
 		ic->ic_args[1].ia_ptr = sp;
-		bp->b_flags |= B_CALL;
-		bp->b_iodone = dsiodone;
-		bp->b_iodone_chain = ic;
-		if (!(bp->b_flags & B_READ)) {
+		bp->bio_done = dsiodone;
+		bp->bio_done_chain = ic;
+		if (bp->bio_cmd != BIO_READ) {
 			/*
 			 * XXX even disklabel(8) writes directly so we need
@@ -260,18 +417,13 @@
 			 * temporarily corrupting the in-core copy.
 			 */
-			if (bp->b_vp != NULL) {
-				s = splbio();
-				bp->b_vp->v_numoutput++;
-				splx(s);
-			}
 			/* XXX need name here. */
 			msg = fixlabel((char *)NULL, sp,
 				       (struct disklabel *)
-				       (bp->b_data + ic->ic_args[0].ia_long),
+				       (bp->bio_data + ic->ic_args[0].ia_long),
 				       TRUE);
 			if (msg != NULL) {
 				printf("dscheck(%s): %s\n",
-				    devtoname(bp->b_dev), msg);
-				bp->b_error = EROFS;
+				    devtoname(bp->bio_dev), msg);
+				bp->bio_error = EROFS;
 				goto bad;
 			}
@@ -282,19 +434,19 @@
 bad_bcount:
 	printf(
-	"dscheck(%s): b_bcount %ld is not on a sector boundary (ssize %d)\n",
-	    devtoname(bp->b_dev), bp->b_bcount, ssp->dss_secsize);
-	bp->b_error = EINVAL;
+	"dscheck(%s): bio_bcount %ld is not on a sector boundary (ssize %d)\n",
+	    devtoname(bp->bio_dev), bp->bio_bcount, ssp->dss_secsize);
+	bp->bio_error = EINVAL;
 	goto bad;

-bad_blkno:
+bad_offset:
 	printf(
-	"dscheck(%s): b_blkno %ld is not on a sector boundary (ssize %d)\n",
-	    devtoname(bp->b_dev), (long)blkno, ssp->dss_secsize);
-	bp->b_error = EINVAL;
+	"dscheck(%s): bio_offset %jd is not on a sector boundary (ssize %d)\n",
+	    devtoname(bp->bio_dev), (intmax_t)offset, ssp->dss_secsize);
+	bp->bio_error = EINVAL;
 	goto bad;

 bad:
-	bp->b_resid = bp->b_bcount;
-	bp->b_flags |= B_ERROR;
+	bp->bio_resid = bp->bio_bcount;
+	bp->bio_flags |= BIO_ERROR;
 	return (-1);
 }
%%%

Bruce


More information about the freebsd-bugs mailing list