PERFORCE change 123662 for review

Ulf Lilleengen lulf at FreeBSD.org
Tue Jul 17 21:09:27 UTC 2007


http://perforce.freebsd.org/chv.cgi?CH=123662

Change 123662 by lulf at lulf_carrot on 2007/07/17 21:08:43

	- Initial implementation of growing RAID-5 arrays. This is done by
	  splitting the offset calculation into one for read and one for write
	  operations. We distinguish between subdisks that were added after
	  the plex was no longer newborn and subdisks that were added at
	  creation/tasting time.  When a BIO write comes in, it goes to the
	  whole plex, but read operations are only done on subdisks that do
	  not have the GV_SD_GROW flag set.  The drawback is that we must
	  ensure that new subdisks are added at a later plexoffset (which we
	  should enforce, to make it easier for us, since there is no good
	  reason why the user should be able to set the plexoffset in this
	  operation).  The implementation will probably change a bit.
	- Add another state called RESIZING, and a flag called GV_PLEX_GROWING
	  to indicate that a plex is in growing operation.
	- Make sure the obvious parts of the code respect this flag. This
	  will need to be looked over more, though.

Affected files ...

.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum.h#21 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_events.c#8 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_init.c#14 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_plex.c#17 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_raid5.c#9 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_share.c#3 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_state.c#17 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_subr.c#19 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_var.h#22 edit

Differences ...

==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum.h#21 (text+ko) ====

@@ -132,6 +132,7 @@
 
 void	gv_init_request(struct gv_sd *, off_t, caddr_t, off_t);
 void	gv_parity_request(struct gv_plex *, int, off_t);
+int	gv_grow_request(struct gv_plex *, off_t, off_t, int, caddr_t);
 void	gv_parityop(struct gv_softc *, struct gctl_req *);
 
 #endif /* !_GEOM_VINUM_H_ */

==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_events.c#8 (text+ko) ====


==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_init.c#14 (text+ko) ====

@@ -42,6 +42,7 @@
 static int	gv_sync(struct gv_volume *);
 static int	gv_rebuild_plex(struct gv_plex *);
 static int	gv_init_plex(struct gv_plex *);
+static int	gv_grow_plex(struct gv_plex *);
 
 void
 gv_start_obj(struct g_geom *gp, struct gctl_req *req)
@@ -115,6 +116,8 @@
 	else if (p->org == GV_PLEX_RAID5) {
 		if (p->state == GV_PLEX_DEGRADED)
 			error = gv_rebuild_plex(p);
+		else if (p->state == GV_PLEX_RESIZING)
+			error = gv_grow_plex(p);
 		else
 			error = gv_init_plex(p);
 	}
@@ -226,7 +229,8 @@
 		return (EBUSY);*/
 
 	if (p->flags & GV_PLEX_SYNCING ||
-	    p->flags & GV_PLEX_REBUILDING)
+	    p->flags & GV_PLEX_REBUILDING ||
+	    p->flags & GV_PLEX_GROWING)
 		return (EINPROGRESS);
 	p->flags |= GV_PLEX_REBUILDING;
 	p->synced = 0;
@@ -236,6 +240,82 @@
 }
 
+/*
+ * Start growing a RAID-5 plex.
+ *
+ * Takes an access count on the provider of the volume and issues the first
+ * grow read at offset 0; gv_grow_complete() keeps the operation going from
+ * there.  Returns 0 when that first request was issued.  Otherwise returns
+ * EINPROGRESS if the plex is already growing, syncing or rebuilding,
+ * GV_ERR_INVSTATE if its state is above GV_PLEX_RESIZING, GV_ERR_NOTFOUND
+ * if it has no subdisks, GV_ERR_ISOPEN if the provider cannot be accessed,
+ * or the error from gv_grow_request(), in which case the access count and
+ * GV_PLEX_GROWING are undone again.
+ */
 static int
+gv_grow_plex(struct gv_plex *p)
+{
+	struct gv_volume *v;
+	struct gv_sd *first, *s;
+	off_t origsize, origlength;
+	int error, sdcount;
+
+	KASSERT(p != NULL, ("gv_grow_plex: NULL p"));
+	v = p->vol_sc;
+	KASSERT(v != NULL, ("gv_grow_plex: NULL v"));
+
+	printf ("Start growing\n");
+	if (p->flags & GV_PLEX_GROWING ||
+	    p->flags & GV_PLEX_SYNCING ||
+	    p->flags & GV_PLEX_REBUILDING)
+		return (EINPROGRESS);
+	if (p->state > GV_PLEX_RESIZING)
+		return (GV_ERR_INVSTATE);
+
+	/*
+	 * Check for subdisks before taking the access count, so that this
+	 * error path has nothing to undo.
+	 */
+	first = LIST_FIRST(&p->subdisks);
+	if (first == NULL) {
+		printf("VINUM: error growing plex without subdisks\n");
+		return (GV_ERR_NOTFOUND);
+	}
+
+	g_topology_lock();
+	error = gv_access(v->provider, 1, 1, 0);
+	g_topology_unlock();
+	if (error) {
+		printf("VINUM: unable to access provider\n");
+		return (GV_ERR_ISOPEN); /*XXX: wrong errorcode */
+	}
+
+	/* XXX: This routine with finding origsize is used two other places as
+	 * well, so we should create a function for it. */
+	sdcount = p->sdcount;
+	LIST_FOREACH(s, &p->subdisks, in_plex) {
+		if (s->flags & GV_SD_GROW)
+			sdcount--;
+	}
+
+	p->flags |= GV_PLEX_GROWING;
+	origsize = (sdcount - 1) * first->size;
+	origlength = (sdcount - 1) * p->stripesize;
+	printf("Starting growing at 0 reading %jd bytes\n",
+	    (intmax_t)origlength);
+	error = gv_grow_request(p, 0, MIN(origlength, origsize), BIO_READ,
+	    NULL);
+	if (error) {
+		/* No bio is in flight; undo the flag and the access count. */
+		p->flags &= ~GV_PLEX_GROWING;
+		g_topology_lock();
+		gv_access(v->provider, -1, -1, 0);
+		g_topology_unlock();
+	}
+
+	return (error);
+}
+
+static int
 gv_init_plex(struct gv_plex *p)
 {
 	struct gv_drive *d;

==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_plex.c#17 (text+ko) ====

@@ -47,6 +47,7 @@
 		    struct gv_raid5_packet *);
 static void	gv_parity_complete(struct gv_plex *, struct bio *);
 static void	gv_rebuild_complete(struct gv_plex *, struct bio *);
+static void	gv_grow_complete(struct gv_plex *, struct bio *);
 static void	gv_init_complete(struct gv_plex *, struct bio *);
 static struct bio * gv_plexbuffer(struct gv_plex *, struct bio *, caddr_t,
 			off_t, off_t, int *);
@@ -376,6 +377,8 @@
 			gv_rebuild_complete(p, pbp);
 		} else if (pbp->bio_cflags & GV_BIO_INIT) {
 			gv_init_complete(p, pbp);
+		} else if (p->state == GV_PLEX_RESIZING) {
+			gv_grow_complete(p, pbp);
 		} else {
 			g_io_deliver(pbp, pbp->bio_error);
 		}
@@ -535,6 +538,138 @@
 }
 
 /*
+ * Finish handling of a bio to a growing plex.
+ *
+ * A finished read is written back with gv_grow_request(), which takes over
+ * the data buffer.  A finished write frees the buffer and either reads the
+ * next chunk or, when the end of the original plex has been reached, clears
+ * the grow flags, requests the up state for subdisks and plex and drops the
+ * access count on the provider of the volume.  The bio is always destroyed.
+ * If anything fails, including the bio that just finished, the grow is
+ * aborted: GV_PLEX_GROWING is cleared and the access count is dropped.
+ */
+static void
+gv_grow_complete(struct gv_plex *p, struct bio *bp)
+{
+	struct gv_sd *s;
+	struct gv_volume *v;
+	off_t origsize, offset;
+	int sdcount, err;
+
+	v = p->vol_sc;
+	err = 0;
+
+	/* Never write back, or move on from, a chunk whose I/O failed. */
+	if (bp->bio_error != 0) {
+		err = bp->bio_error;
+		if (bp->bio_cflags & GV_BIO_MALLOC)
+			g_free(bp->bio_data);
+		goto out;
+	}
+
+	/* If it was a read, write it. */
+	if (bp->bio_cmd == BIO_READ) {
+		printf("Finished read, do a write\n");
+		err = gv_grow_request(p, bp->bio_offset, bp->bio_length,
+		    BIO_WRITE, bp->bio_data);
+	/* If it was a write, read next. */
+	} else if (bp->bio_cmd == BIO_WRITE) {
+		if (bp->bio_cflags & GV_BIO_MALLOC)
+			g_free(bp->bio_data);
+
+		/* Find the real size of the plex. */
+		sdcount = p->sdcount;
+		LIST_FOREACH(s, &p->subdisks, in_plex) {
+			if (s->flags & GV_SD_GROW)
+				sdcount--;
+		}
+		s = LIST_FIRST(&p->subdisks);
+		if (s == NULL) {
+			/* Leave through the common exit so bp is not leaked. */
+			printf("VINUM: error growing plex without subdisks\n");
+			err = GV_ERR_NOTFOUND;
+			goto out;
+		}
+		origsize = (s->size * (sdcount - 1));
+		if (bp->bio_offset + bp->bio_length >= origsize) {
+			printf("VINUM: growing of %s completed\n", p->name);
+			p->flags &= ~GV_PLEX_GROWING;
+			printf("Updating state\n");
+			LIST_FOREACH(s, &p->subdisks, in_plex) {
+				s->flags &= ~GV_SD_GROW;
+				gv_set_sd_state(s, GV_SD_UP, 0);
+			}
+			gv_set_plex_state(p, GV_PLEX_UP, 0);
+			g_topology_lock();
+			gv_access(v->provider, -1, -1, 0);
+			g_topology_unlock();
+		} else {
+			offset = bp->bio_offset + bp->bio_length;
+			printf("Issuing next bio read at 0x%jx\n",
+			    (uintmax_t)offset);
+			err = gv_grow_request(p, offset,
+			   MIN(bp->bio_length, origsize - offset),
+			   BIO_READ, NULL);
+		}
+	}
+
+out:
+	g_destroy_bio(bp);
+
+	if (err) {
+		p->flags &= ~GV_PLEX_GROWING;
+		printf("VINUM: error growing plex: error code %d\n", err);
+		/* Give back the access count taken when the grow started. */
+		g_topology_lock();
+		gv_access(v->provider, -1, -1, 0);
+		g_topology_unlock();
+	}
+}
+
+/*
+ * Create a new bio struct for the next grow request and send it down.
+ *
+ * If data is NULL, a buffer of length bytes is allocated for the request.
+ * Otherwise the request takes over the buffer in data, and frees it if the
+ * bio cannot be allocated.  Returns 0 on success, ENOMEM otherwise.
+ */
+int
+gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type,
+    caddr_t data)
+{
+	struct bio *bp;
+
+	KASSERT(p != NULL, ("gv_grow_request: NULL p"));
+	bp = g_new_bio();
+	if (bp == NULL) {
+		printf("VINUM: grow of %s failed creating bio: "
+		    "out of memory\n", p->name);
+		/* The caller never frees a buffer it has handed over. */
+		if (data != NULL)
+			g_free(data);
+		return (ENOMEM);
+	}
+
+	bp->bio_cmd = type;
+	bp->bio_done = gv_done;
+	bp->bio_error = 0;
+	bp->bio_offset = offset;
+	bp->bio_length = length;
+	if (data == NULL)
+		data = g_malloc(length, M_WAITOK);
+	/* Set for passed-in buffers too; the write completion frees them. */
+	bp->bio_cflags |= GV_BIO_MALLOC;
+	bp->bio_data = data;
+
+	/* Send down. */
+	printf("Sending bio: ");
+	g_print_bio(bp);
+	printf("\n");
+	gv_plex_start(p, bp);
+	return (0);
+}
+
+/*
  * Handle a finished initialization BIO.
  */
 static void

==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_raid5.c#9 (text+ko) ====

@@ -45,6 +45,8 @@
 
 static int		gv_raid5_offset(struct gv_plex *, off_t, off_t,
 			    off_t *, off_t *, int *, int *);
+static int		gv_raid5_offset_read(struct gv_plex *, off_t, off_t,
+			    off_t *, off_t *, int *, int *);
 static struct bio *	gv_raid5_clone_bio(struct bio *, struct gv_sd *,
 			    struct gv_raid5_packet *, caddr_t, int);
 static int	gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *,
@@ -341,8 +343,15 @@
 	type = REQ_TYPE_NORMAL;
 	original = parity = broken = NULL;
 
-	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno);
+	/* Reads must take into account the growing plexes. */
+	if (bp->bio_cmd == BIO_READ)
+		gv_raid5_offset_read(p, boff, bcount, &real_off, &real_len,
+		     &sdno, &psdno);
+	else
+		gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno,
+		    &psdno);
 
+	printf("Got sdno %d and psdno %d\n", sdno, psdno);
 	/* Find the right subdisks. */
 	i = 0;
 	LIST_FOREACH(s, &p->subdisks, in_plex) {
@@ -526,6 +535,80 @@
 	return (0);
 }
 
+/*
+ * Calculate the offsets in the various subdisks for a RAID5 read request,
+ * counting only the subdisks that are not flagged GV_SD_GROW, so that reads
+ * follow the layout without the newly added subdisks.
+ *
+ * boff and bcount are the offset and length of the request in the plex.
+ * On return *real_off holds the offset on the subdisk and *real_len the
+ * part of bcount that fits before the end of that stripe.  If non-NULL,
+ * *sdno and *psdno receive the number of the data and parity subdisk.
+ * Always returns 0.
+ *
+ * XXX: This assumes that the new subdisks are inserted after the others (which
+ * is okay as long as plex_offset is larger). If subdisks are inserted into the
+ * plexlist before, we get problems.
+ */
+static int
+gv_raid5_offset_read(struct gv_plex *p, off_t boff, off_t bcount,
+    off_t *real_off, off_t *real_len, int *sdno, int *psdno)
+{
+	struct gv_sd *s;
+	int sd, psd, sdcount;
+	off_t len_left, stripeend, stripeoff, stripestart;
+
+	printf("In read we take into account new subdisks.\n");
+	sdcount = p->sdcount;
+	LIST_FOREACH(s, &p->subdisks, in_plex) {
+		if (s->flags & GV_SD_GROW) {
+			printf("Decrease\n");
+			sdcount--;
+		}
+	}
+	/* sdcount - 1 is used as a divisor below, so it must not be zero. */
+	KASSERT(sdcount > 1, ("gv_raid5_offset_read: sdcount < 2"));
+
+	/* The number of the subdisk containing the parity stripe. */
+	psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) %
+	    sdcount;
+	/* Check the computed number, not the caller's output pointer. */
+	KASSERT(psd >= 0, ("gv_raid5_offset_read: psdno < 0"));
+
+	/* Offset of the start address from the start of the stripe. */
+	stripeoff = boff % (p->stripesize * (sdcount - 1));
+	KASSERT(stripeoff >= 0, ("gv_raid5_offset_read: stripeoff < 0"));
+
+	/* The number of the subdisk where the stripe resides. */
+	sd = stripeoff / p->stripesize;
+	KASSERT(sd >= 0, ("gv_raid5_offset_read: sdno < 0"));
+
+	/* At or past parity subdisk. */
+	if (sd >= psd)
+		sd++;
+
+	/* The offset of the stripe on this subdisk. */
+	stripestart = (boff - stripeoff) / (sdcount - 1);
+	KASSERT(stripestart >= 0, ("gv_raid5_offset_read: stripestart < 0"));
+
+	stripeoff %= p->stripesize;
+
+	/* The offset of the request on this subdisk. */
+	*real_off = stripestart + stripeoff;
+
+	stripeend = stripestart + p->stripesize;
+	len_left = stripeend - *real_off;
+	KASSERT(len_left >= 0, ("gv_raid5_offset_read: len_left < 0"));
+
+	/* Clamp the request to what is left of this stripe. */
+	*real_len = (bcount <= len_left) ? bcount : len_left;
+
+	if (sdno != NULL)
+		*sdno = sd;
+	if (psdno != NULL)
+		*psdno = psd;
+
+	return (0);
+}
+
 /* Calculate the offsets in the various subdisks for a RAID5 request. */
 static int
 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,

==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_share.c#3 (text+ko) ====

@@ -274,6 +274,8 @@
 		return (GV_PLEX_INITIALIZING);
 	else if (!strcmp(buf, "degraded"))
 		return (GV_PLEX_DEGRADED);
+	else if (!strcmp(buf, "resizing"))
+		return (GV_PLEX_RESIZING);
 	else
 		return (GV_PLEX_DOWN);
 }
@@ -287,6 +289,8 @@
 		return "down";
 	case GV_PLEX_INITIALIZING:
 		return "initializing";
+	case GV_PLEX_RESIZING:
+		return "resizing";
 	case GV_PLEX_DEGRADED:
 		return "degraded";
 	case GV_PLEX_UP:

==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_state.c#17 (text+ko) ====

@@ -419,8 +419,12 @@
 	/* First, check the state of our subdisks. */
 	sdstates = gv_sdstatemap(p);
 	
+	/* If we're growing, don't change the state. */
+	if (p->flags & GV_PLEX_GROWING)
+		p->state = GV_PLEX_RESIZING;
+
 	/* If all subdisks are up, our plex can be up, too. */
-	if (sdstates == GV_SD_UPSTATE)
+	else if (sdstates == GV_SD_UPSTATE)
 		p->state = GV_PLEX_UP;
 
 	/* One or more of our subdisks are down. */

==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_subr.c#19 (text+ko) ====

@@ -163,6 +163,8 @@
 			 * needed here (on-disk config parsing).
 			 */
 			s->flags &= ~GV_SD_NEWBORN;
+			s->flags &= ~GV_SD_GROW;
+			printf("S-flags is now: %d\n", s->flags);
 		}
 	}
 }
@@ -333,6 +335,13 @@
 		}
 		p->sddetached--;
 	} else {
+		if ((p->org == GV_PLEX_RAID5 ||
+		    p->org == GV_PLEX_STRIPED) &&
+		    !(p->flags & GV_PLEX_NEWBORN)) {
+			printf("Adding to a running plex, must add grow-flag to"
+			    " sd and plex\n");
+			s->flags |= GV_SD_GROW;
+		}
 		p->sdcount++;
 	}
 
@@ -451,8 +460,8 @@
 			}
 		}
 
-		/* Trim subdisk sizes so that they match the stripe size. */
 		LIST_FOREACH(s, &p->subdisks, in_plex) {
+			/* Trim subdisk sizes to match the stripe size. */
 			remainder = s->size % p->stripesize;
 			if (remainder) {
 				printf("VINUM: size of sd %s is not a "
@@ -473,6 +482,14 @@
 			gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE);
 		p->flags &= ~GV_PLEX_ADDED;
 		gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE);
+	} else {
+		LIST_FOREACH(s, &p->subdisks, in_plex) {
+			if (s->flags & GV_SD_GROW) {
+				printf("Setting state\n");
+				p->state = GV_PLEX_RESIZING;
+				break;
+			}
+		}
 	}
 	/* Our plex is grown up now. */
 	p->flags &= ~GV_PLEX_NEWBORN;
@@ -1075,7 +1092,7 @@
 gv_attach_sd(struct gv_sd *s, struct gv_plex *p, off_t offset, int rename)
 {
 	struct gv_sd *s2;
-	int error;
+	int error, sdcount;
 
 	g_topology_assert();
 
@@ -1099,6 +1116,7 @@
 	s->plex_offset = offset;
 	strlcpy(s->plex, p->name, GV_MAXPLEXNAME);
 
+	sdcount = p->sdcount;
 	error = gv_sd_to_plex(s, p);
 	if (error)
 		return (error);

==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_var.h#22 (text+ko) ====

@@ -288,6 +288,7 @@
 #define	GV_SD_NEWBORN		0x01	/* Subdisk is created by user. */
 #define	GV_SD_TASTED		0x02	/* Subdisk is created during taste. */
 #define	GV_SD_CANGOUP		0x04	/* Subdisk can go up immediately. */
+#define GV_SD_GROW		0x08	/* Subdisk is added to striped plex. */
 
 	char drive[GV_MAXDRIVENAME];	/* Name of underlying drive. */
 	char plex[GV_MAXPLEXNAME];	/* Name of associated plex. */
@@ -309,8 +310,9 @@
 	int	state;			/* The plex state. */
 #define	GV_PLEX_DOWN		0
 #define	GV_PLEX_INITIALIZING	1
-#define	GV_PLEX_DEGRADED	2
-#define	GV_PLEX_UP		3
+#define GV_PLEX_RESIZING	2
+#define	GV_PLEX_DEGRADED	3
+#define	GV_PLEX_UP		4
 
 	int	org;			/* The plex organisation. */
 #define	GV_PLEX_DISORG	0
@@ -334,6 +336,7 @@
 #define	GV_PLEX_THREAD_DEAD	0x10	/* The RAID5 thread has died. */
 #define	GV_PLEX_NEWBORN		0x20	/* The plex was just created. */
 #define GV_PLEX_REBUILDING	0x40	/* The plex is rebuilding. */
+#define GV_PLEX_GROWING		0x80	/* The plex is growing. */
 
 	off_t	synced;			/* Count of synced bytes. */
 


More information about the p4-projects mailing list