PERFORCE change 123662 for review
Ulf Lilleengen
lulf at FreeBSD.org
Tue Jul 17 21:09:27 UTC 2007
http://perforce.freebsd.org/chv.cgi?CH=123662
Change 123662 by lulf at lulf_carrot on 2007/07/17 21:08:43
- Initial implementation of growing RAID-5 arrays. This is done by
splitting the offset calculation into one for read and one for write
operations. We distinguish between subdisks that were added after the
plex was no longer newborn and subdisks that were added at
creation/tasting time. When a BIO write arrives, the write will go to
the whole plex, but read operations will only be done on subdisks that
do not have the GV_SD_GROW flag set. The drawback is that we must
ensure that new subdisks are added at a later plex offset than the
existing ones (which we should enforce, to make things easier for us,
since there is no good reason why the user should be able to set the
plex offset in this operation). The implementation will probably change a bit.
- Add another plex state, GV_PLEX_RESIZING, and a flag, GV_PLEX_GROWING,
to indicate that a plex is being grown.
- Make sure the obvious parts of the code respect this flag. This will
need to be looked over more, though.
Affected files ...
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum.h#21 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_events.c#8 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_init.c#14 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_plex.c#17 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_raid5.c#9 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_share.c#3 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_state.c#17 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_subr.c#19 edit
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_var.h#22 edit
Differences ...
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum.h#21 (text+ko) ====
@@ -132,6 +132,7 @@
void gv_init_request(struct gv_sd *, off_t, caddr_t, off_t);
void gv_parity_request(struct gv_plex *, int, off_t);
+int gv_grow_request(struct gv_plex *, off_t, off_t, int, caddr_t);
void gv_parityop(struct gv_softc *, struct gctl_req *);
#endif /* !_GEOM_VINUM_H_ */
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_events.c#8 (text+ko) ====
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_init.c#14 (text+ko) ====
@@ -42,6 +42,7 @@
static int gv_sync(struct gv_volume *);
static int gv_rebuild_plex(struct gv_plex *);
static int gv_init_plex(struct gv_plex *);
+static int gv_grow_plex(struct gv_plex *);
void
gv_start_obj(struct g_geom *gp, struct gctl_req *req)
@@ -115,6 +116,8 @@
else if (p->org == GV_PLEX_RAID5) {
if (p->state == GV_PLEX_DEGRADED)
error = gv_rebuild_plex(p);
+ else if (p->state == GV_PLEX_RESIZING)
+ error = gv_grow_plex(p);
else
error = gv_init_plex(p);
}
@@ -226,7 +229,8 @@
return (EBUSY);*/
if (p->flags & GV_PLEX_SYNCING ||
- p->flags & GV_PLEX_REBUILDING)
+ p->flags & GV_PLEX_REBUILDING ||
+ p->flags & GV_PLEX_GROWING)
return (EINPROGRESS);
p->flags |= GV_PLEX_REBUILDING;
p->synced = 0;
@@ -236,6 +240,54 @@
}
+/*
+ * Start growing plex p onto the subdisks that carry GV_SD_GROW.
+ *
+ * Opens the volume provider for reading and writing, marks the plex as
+ * GV_PLEX_GROWING and issues the first read of the old layout.  Follow-up
+ * requests are issued from gv_grow_complete().
+ *
+ * Returns 0 once the first request has been handed to gv_grow_request(),
+ * EINPROGRESS if the plex is already growing, syncing or rebuilding,
+ * GV_ERR_INVSTATE if the plex state is above GV_PLEX_RESIZING,
+ * GV_ERR_ISOPEN if the provider cannot be opened, GV_ERR_NOTFOUND if the
+ * plex has no subdisks, or the error returned by gv_grow_request().
+ */
static int
+gv_grow_plex(struct gv_plex *p)
+{
+	struct gv_volume *v;
+	struct gv_sd *s;
+	off_t origsize, origlength;
+	int error, sdcount;
+
+	KASSERT(p != NULL, ("gv_grow_plex: NULL p"));
+	v = p->vol_sc;
+	KASSERT(v != NULL, ("gv_grow_plex: NULL v"));
+
+	printf ("Start growing\n");
+	if (p->flags & GV_PLEX_GROWING ||
+	    p->flags & GV_PLEX_SYNCING ||
+	    p->flags & GV_PLEX_REBUILDING)
+		return (EINPROGRESS);
+	if (p->state > GV_PLEX_RESIZING)
+		return (GV_ERR_INVSTATE);
+	g_topology_lock();
+	error = gv_access(v->provider, 1, 1, 0);
+	g_topology_unlock();
+	if (error) {
+		printf("VINUM: unable to access provider\n");
+		return (GV_ERR_ISOPEN); /*XXX: wrong errorcode */
+	}
+
+	/* XXX: This routine with finding origsize is used two other places as
+	 * well, so we should create a function for it. */
+	/* Count only the subdisks that were part of the plex before growing. */
+	sdcount = p->sdcount;
+	LIST_FOREACH(s, &p->subdisks, in_plex) {
+		if (s->flags & GV_SD_GROW)
+			sdcount--;
+	}
+	s = LIST_FIRST(&p->subdisks);
+	if (s == NULL) {
+		printf("VINUM: error growing plex without subdisks\n");
+		error = GV_ERR_NOTFOUND;
+		goto fail;
+	}
+	p->flags |= GV_PLEX_GROWING;
+	/* One subdisk's worth of every stripe holds parity, hence the -1. */
+	origsize = (sdcount - 1) * s->size;
+	origlength = (sdcount - 1) * p->stripesize;
+	printf("Starting growing at 0 reading %jd bytes\n",
+	    (intmax_t)origlength);
+	error = gv_grow_request(p, 0, MIN(origlength, origsize), BIO_READ,
+	    NULL);
+	if (error) {
+		/* Nothing is in flight, so the plex is not growing after all. */
+		p->flags &= ~GV_PLEX_GROWING;
+		goto fail;
+	}
+
+	return (0);
+
+fail:
+	/* Drop the access counts taken above; no request will do it for us. */
+	g_topology_lock();
+	gv_access(v->provider, -1, -1, 0);
+	g_topology_unlock();
+	return (error);
+}
+
+static int
gv_init_plex(struct gv_plex *p)
{
struct gv_drive *d;
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_plex.c#17 (text+ko) ====
@@ -47,6 +47,7 @@
struct gv_raid5_packet *);
static void gv_parity_complete(struct gv_plex *, struct bio *);
static void gv_rebuild_complete(struct gv_plex *, struct bio *);
+static void gv_grow_complete(struct gv_plex *, struct bio *);
static void gv_init_complete(struct gv_plex *, struct bio *);
static struct bio * gv_plexbuffer(struct gv_plex *, struct bio *, caddr_t,
off_t, off_t, int *);
@@ -376,6 +377,8 @@
gv_rebuild_complete(p, pbp);
} else if (pbp->bio_cflags & GV_BIO_INIT) {
gv_init_complete(p, pbp);
+ } else if (p->state == GV_PLEX_RESIZING) {
+ gv_grow_complete(p, pbp);
} else {
g_io_deliver(pbp, pbp->bio_error);
}
@@ -535,6 +538,106 @@
}
/*
+ * Finish handling of a bio to a growing plex.
+ *
+ * A completed read is turned into a write of the same range and buffer.
+ * A completed write either ends the grow operation (when it reached the
+ * end of the original plex size) or triggers the read of the next range.
+ * The finished bio is always destroyed here.
+ *
+ * If the bio failed, or the follow-up request could not be created, the
+ * grow operation is aborted: the data buffer is freed, GV_PLEX_GROWING is
+ * cleared and the access counts on the volume provider are dropped again.
+ * NOTE(review): this assumes the counts were taken by the code that
+ * started the grow operation (gv_grow_plex()) -- confirm.
+ */
+static void
+gv_grow_complete(struct gv_plex *p, struct bio *bp)
+{
+	struct gv_sd *s;
+	struct gv_volume *v;
+	off_t origsize, offset;
+	int sdcount, err;
+
+	v = p->vol_sc;
+	err = 0;
+
+	if (bp->bio_error != 0) {
+		/* Never write out data from a failed read, nor go on. */
+		err = bp->bio_error;
+		if (bp->bio_cflags & GV_BIO_MALLOC)
+			g_free(bp->bio_data);
+	/* If it was a read, write it. */
+	} else if (bp->bio_cmd == BIO_READ) {
+		printf("Finished read, do a write\n");
+		err = gv_grow_request(p, bp->bio_offset, bp->bio_length,
+		    BIO_WRITE, bp->bio_data);
+		/* Without a write bio nobody else will free the buffer. */
+		if (err && (bp->bio_cflags & GV_BIO_MALLOC))
+			g_free(bp->bio_data);
+	/* If it was a write, read next. */
+	} else if (bp->bio_cmd == BIO_WRITE) {
+		if (bp->bio_cflags & GV_BIO_MALLOC)
+			g_free(bp->bio_data);
+
+		/* Find the real size of the plex. */
+		sdcount = p->sdcount;
+		LIST_FOREACH(s, &p->subdisks, in_plex) {
+			if (s->flags & GV_SD_GROW)
+				sdcount--;
+		}
+		s = LIST_FIRST(&p->subdisks);
+		if (s == NULL) {
+			printf("VINUM: error growing plex without subdisks\n");
+			err = GV_ERR_NOTFOUND;
+		} else {
+			origsize = (s->size * (sdcount - 1));
+			offset = bp->bio_offset + bp->bio_length;
+			if (offset >= origsize) {
+				printf("VINUM: growing of %s completed\n",
+				    p->name);
+				p->flags &= ~GV_PLEX_GROWING;
+				printf("Updating state\n");
+				LIST_FOREACH(s, &p->subdisks, in_plex) {
+					s->flags &= ~GV_SD_GROW;
+					gv_set_sd_state(s, GV_SD_UP, 0);
+				}
+				gv_set_plex_state(p, GV_PLEX_UP, 0);
+				g_topology_lock();
+				gv_access(v->provider, -1, -1, 0);
+				g_topology_unlock();
+			} else {
+				printf("Issuing next bio read at 0x%jx\n",
+				    (uintmax_t)offset);
+				/* Do not read past the original plex size. */
+				err = gv_grow_request(p, offset,
+				    MIN(bp->bio_length, origsize - offset),
+				    BIO_READ, NULL);
+			}
+		}
+	}
+	g_destroy_bio(bp);
+
+	if (err) {
+		p->flags &= ~GV_PLEX_GROWING;
+		printf("VINUM: error growing plex: error code %d\n", err);
+		/* The operation is over; release the provider again. */
+		g_topology_lock();
+		gv_access(v->provider, -1, -1, 0);
+		g_topology_unlock();
+	}
+}
+
+/*
+ * Create a new bio struct for the next grow request.
+ *
+ * Builds a bio of the given type (bio_cmd) for the range [offset,
+ * offset + length) and passes it to gv_plex_start().  If data is NULL a
+ * buffer of length bytes is allocated here; otherwise the caller's buffer
+ * is used.  GV_BIO_MALLOC is set on the bio in both cases.
+ *
+ * Returns 0 after the bio has been passed on, or ENOMEM if no bio could
+ * be allocated; in the latter case a caller-supplied buffer is left
+ * untouched.
+ */
+int
+gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type,
+    caddr_t data)
+{
+	struct bio *gbp;
+
+	KASSERT(p != NULL, ("gv_grow_request: NULL p"));
+	gbp = g_new_bio();
+	if (gbp == NULL) {
+		printf("VINUM: grow of %s failed creating bio: "
+		    "out of memory\n", p->name);
+		return (ENOMEM);
+	}
+
+	/* No buffer supplied: allocate one of the requested length. */
+	if (data == NULL) {
+		data = g_malloc(length, M_WAITOK);
+	}
+
+	gbp->bio_data = data;
+	gbp->bio_length = length;
+	gbp->bio_offset = offset;
+	gbp->bio_error = 0;
+	gbp->bio_done = gv_done;
+	gbp->bio_cmd = type;
+	/* Set for every request, also when the caller supplied the buffer. */
+	gbp->bio_cflags |= GV_BIO_MALLOC;
+
+	/* Send down. */
+	printf("Sending bio: ");
+	g_print_bio(gbp);
+	printf("\n");
+	gv_plex_start(p, gbp);
+	return (0);
+}
+
+/*
* Handle a finished initialization BIO.
*/
static void
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_raid5.c#9 (text+ko) ====
@@ -45,6 +45,8 @@
static int gv_raid5_offset(struct gv_plex *, off_t, off_t,
off_t *, off_t *, int *, int *);
+static int gv_raid5_offset_read(struct gv_plex *, off_t, off_t,
+ off_t *, off_t *, int *, int *);
static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *,
struct gv_raid5_packet *, caddr_t, int);
static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *,
@@ -341,8 +343,15 @@
type = REQ_TYPE_NORMAL;
original = parity = broken = NULL;
- gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno);
+ /* Reads must take into account the growing plexes. */
+ if (bp->bio_cmd == BIO_READ)
+ gv_raid5_offset_read(p, boff, bcount, &real_off, &real_len,
+ &sdno, &psdno);
+ else
+ gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno,
+ &psdno);
+ printf("Got sdno %d and psdno %d\n", sdno, psdno);
/* Find the right subdisks. */
i = 0;
LIST_FOREACH(s, &p->subdisks, in_plex) {
@@ -526,6 +535,70 @@
return (0);
}
+/*
+ * Calculate the offsets in the various subdisks for a RAID5 request. Also take
+ * care of new subdisks in an expanded RAID5 array: subdisks flagged with
+ * GV_SD_GROW are left out of the subdisk count used for the calculation.
+ * XXX: This assumes that the new subdisks are inserted after the others (which
+ * is okay as long as plex_offset is larger). If subdisks are inserted into the
+ * plexlist before, we get problems.
+ *
+ * boff and bcount are the offset and length of the request in the plex.
+ * On return *real_off holds the offset on the subdisk and *real_len the
+ * part of bcount that fits into the current stripe.  If not NULL, *sdno
+ * and *psdno receive the numbers of the data and the parity subdisk.
+ * Always returns 0.
+ */
+static int
+gv_raid5_offset_read(struct gv_plex *p, off_t boff, off_t bcount,
+    off_t *real_off, off_t *real_len, int *sdno, int *psdno)
+{
+	struct gv_sd *s;
+	int sd, psd, sdcount;
+	off_t len_left, stripeend, stripeoff, stripestart;
+
+	printf("In read we take into account new subdisks.\n");
+	sdcount = p->sdcount;
+	LIST_FOREACH(s, &p->subdisks, in_plex) {
+		if (s->flags & GV_SD_GROW) {
+			printf("Decrease\n");
+			sdcount--;
+		}
+	}
+	/* Everything below divides by (sdcount - 1). */
+	KASSERT(sdcount > 1, ("gv_raid5_offset_read: sdcount < 2"));
+
+	/* The number of the subdisk containing the parity stripe. */
+	psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) %
+	    sdcount;
+	KASSERT(psd >= 0, ("gv_raid5_offset_read: psdno < 0"));
+
+	/* Offset of the start address from the start of the stripe. */
+	stripeoff = boff % (p->stripesize * (sdcount - 1));
+	KASSERT(stripeoff >= 0, ("gv_raid5_offset_read: stripeoff < 0"));
+
+	/* The number of the subdisk where the stripe resides. */
+	sd = stripeoff / p->stripesize;
+	KASSERT(sd >= 0, ("gv_raid5_offset_read: sdno < 0"));
+
+	/* At or past parity subdisk. */
+	if (sd >= psd)
+		sd++;
+
+	/* The offset of the stripe on this subdisk. */
+	stripestart = (boff - stripeoff) / (sdcount - 1);
+	KASSERT(stripestart >= 0, ("gv_raid5_offset_read: stripestart < 0"));
+
+	stripeoff %= p->stripesize;
+
+	/* The offset of the request on this subdisk. */
+	*real_off = stripestart + stripeoff;
+
+	stripeend = stripestart + p->stripesize;
+	len_left = stripeend - *real_off;
+	KASSERT(len_left >= 0, ("gv_raid5_offset_read: len_left < 0"));
+
+	/* A request never crosses the end of the current stripe. */
+	*real_len = (bcount <= len_left) ? bcount : len_left;
+
+	if (sdno != NULL)
+		*sdno = sd;
+	if (psdno != NULL)
+		*psdno = psd;
+
+	return (0);
+}
+
/* Calculate the offsets in the various subdisks for a RAID5 request. */
static int
gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_share.c#3 (text+ko) ====
@@ -274,6 +274,8 @@
return (GV_PLEX_INITIALIZING);
else if (!strcmp(buf, "degraded"))
return (GV_PLEX_DEGRADED);
+ else if (!strcmp(buf, "resizing"))
+ return (GV_PLEX_RESIZING);
else
return (GV_PLEX_DOWN);
}
@@ -287,6 +289,8 @@
return "down";
case GV_PLEX_INITIALIZING:
return "initializing";
+ case GV_PLEX_RESIZING:
+ return "resizing";
case GV_PLEX_DEGRADED:
return "degraded";
case GV_PLEX_UP:
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_state.c#17 (text+ko) ====
@@ -419,8 +419,12 @@
/* First, check the state of our subdisks. */
sdstates = gv_sdstatemap(p);
+ /* If we're growing, don't change the state. */
+ if (p->flags & GV_PLEX_GROWING)
+ p->state = GV_PLEX_RESIZING;
+
/* If all subdisks are up, our plex can be up, too. */
- if (sdstates == GV_SD_UPSTATE)
+ else if (sdstates == GV_SD_UPSTATE)
p->state = GV_PLEX_UP;
/* One or more of our subdisks are down. */
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_subr.c#19 (text+ko) ====
@@ -163,6 +163,8 @@
* needed here (on-disk config parsing).
*/
s->flags &= ~GV_SD_NEWBORN;
+ s->flags &= ~GV_SD_GROW;
+ printf("S-flags is now: %d\n", s->flags);
}
}
}
@@ -333,6 +335,13 @@
}
p->sddetached--;
} else {
+ if ((p->org == GV_PLEX_RAID5 ||
+ p->org == GV_PLEX_STRIPED) &&
+ !(p->flags & GV_PLEX_NEWBORN)) {
+ printf("Adding to a running plex, must add grow-flag to"
+ " sd and plex\n");
+ s->flags |= GV_SD_GROW;
+ }
p->sdcount++;
}
@@ -451,8 +460,8 @@
}
}
- /* Trim subdisk sizes so that they match the stripe size. */
LIST_FOREACH(s, &p->subdisks, in_plex) {
+ /* Trim subdisk sizes to match the stripe size. */
remainder = s->size % p->stripesize;
if (remainder) {
printf("VINUM: size of sd %s is not a "
@@ -473,6 +482,14 @@
gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE);
p->flags &= ~GV_PLEX_ADDED;
gv_set_plex_state(p, GV_PLEX_DOWN, GV_SETSTATE_FORCE);
+ } else {
+ LIST_FOREACH(s, &p->subdisks, in_plex) {
+ if (s->flags & GV_SD_GROW) {
+ printf("Setting state\n");
+ p->state = GV_PLEX_RESIZING;
+ break;
+ }
+ }
}
/* Our plex is grown up now. */
p->flags &= ~GV_PLEX_NEWBORN;
@@ -1075,7 +1092,7 @@
gv_attach_sd(struct gv_sd *s, struct gv_plex *p, off_t offset, int rename)
{
struct gv_sd *s2;
- int error;
+ int error, sdcount;
g_topology_assert();
@@ -1099,6 +1116,7 @@
s->plex_offset = offset;
strlcpy(s->plex, p->name, GV_MAXPLEXNAME);
+ sdcount = p->sdcount;
error = gv_sd_to_plex(s, p);
if (error)
return (error);
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/geom/vinum/geom_vinum_var.h#22 (text+ko) ====
@@ -288,6 +288,7 @@
#define GV_SD_NEWBORN 0x01 /* Subdisk is created by user. */
#define GV_SD_TASTED 0x02 /* Subdisk is created during taste. */
#define GV_SD_CANGOUP 0x04 /* Subdisk can go up immediately. */
+#define GV_SD_GROW 0x08 /* Subdisk is added to striped plex. */
char drive[GV_MAXDRIVENAME]; /* Name of underlying drive. */
char plex[GV_MAXPLEXNAME]; /* Name of associated plex. */
@@ -309,8 +310,9 @@
int state; /* The plex state. */
#define GV_PLEX_DOWN 0
#define GV_PLEX_INITIALIZING 1
-#define GV_PLEX_DEGRADED 2
-#define GV_PLEX_UP 3
+#define GV_PLEX_RESIZING 2
+#define GV_PLEX_DEGRADED 3
+#define GV_PLEX_UP 4
int org; /* The plex organisation. */
#define GV_PLEX_DISORG 0
@@ -334,6 +336,7 @@
#define GV_PLEX_THREAD_DEAD 0x10 /* The RAID5 thread has died. */
#define GV_PLEX_NEWBORN 0x20 /* The plex was just created. */
#define GV_PLEX_REBUILDING 0x40 /* The plex is rebuilding. */
+#define GV_PLEX_GROWING 0x80 /* The plex is growing. */
off_t synced; /* Count of synced bytes. */
More information about the p4-projects
mailing list