svn commit: r217311 - projects/graid/head/sys/geom/raid

Alexander Motin mav at FreeBSD.org
Wed Jan 12 16:00:49 UTC 2011


Author: mav
Date: Wed Jan 12 16:00:49 2011
New Revision: 217311
URL: http://svn.freebsd.org/changeset/base/217311

Log:
  Create all disks and subdisks as described by metadata during array startup
  even if real disks are absent. Objects created for absent disks are used
  to store information required to write metadata when array is degraded.
  
  Check whether a drive is stale based on its status in the freshest metadata
  version found, not just by comparing generations. Generations can get out
  of sync during an emergency shutdown in the middle of a metadata write, but
  that doesn't mean the disk should be dropped.

Modified:
  projects/graid/head/sys/geom/raid/md_intel.c

Modified: projects/graid/head/sys/geom/raid/md_intel.c
==============================================================================
--- projects/graid/head/sys/geom/raid/md_intel.c	Wed Jan 12 15:05:31 2011	(r217310)
+++ projects/graid/head/sys/geom/raid/md_intel.c	Wed Jan 12 16:00:49 2011	(r217311)
@@ -304,6 +304,19 @@ intel_meta_copy(struct intel_raid_conf *
 	return (nmeta);
 }
 
+static int
+intel_meta_find_disk(struct intel_raid_conf *meta, char *serial)
+{
+	int pos;
+
+	for (pos = 0; pos < meta->total_disks; pos++) {
+		if (strncmp(meta->disk[pos].serial,
+		    serial, INTEL_SERIAL_LEN) == 0)
+			return (pos);
+	}
+	return (-1);
+}
+
 static struct intel_raid_conf *
 intel_meta_read(struct g_consumer *cp)
 {
@@ -413,19 +426,19 @@ intel_meta_write(struct g_consumer *cp, 
 	return (error);
 }
 
-#if 0
 static struct g_raid_disk *
 g_raid_md_intel_get_disk(struct g_raid_softc *sc, int id)
 {
 	struct g_raid_disk	*disk;
+	struct g_raid_md_intel_perdisk *pd;
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
-		if ((intptr_t)(disk->d_md_data) == id)
+		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
+		if (pd->pd_disk_pos == id)
 			break;
 	}
 	return (disk);
 }
-#endif
 
 static struct g_raid_volume *
 g_raid_md_intel_get_volume(struct g_raid_softc *sc, int id)
@@ -443,51 +456,53 @@ static void
 g_raid_md_intel_start_disk(struct g_raid_disk *disk)
 {
 	struct g_raid_softc *sc;
-	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
+	struct g_raid_disk *olddisk;
 	struct g_raid_md_object *md;
 	struct g_raid_md_intel_object *mdi;
-	struct g_raid_md_intel_perdisk *pd;
-	struct intel_raid_conf *meta, *pdmeta;
-	struct intel_raid_vol *mvol;
-	struct intel_raid_map *mmap;
-	int i, j;
+	struct g_raid_md_intel_perdisk *pd, *oldpd;
+	struct intel_raid_conf *meta;
+	int disk_pos;
 
 	sc = disk->d_softc;
 	md = sc->sc_md;
 	mdi = (struct g_raid_md_intel_object *)md;
 	meta = mdi->mdio_meta;
 	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
-	pdmeta = pd->pd_meta;
 
-	if (pdmeta->generation != meta->generation) {
+	/* Find disk position in metadata by it's serial. */
+	disk_pos = intel_meta_find_disk(meta, pd->pd_disk_meta.serial);
+	if (disk_pos < 0) {
+		G_RAID_DEBUG(1, "Unknown, probably stale disk");
 		g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE);
 		return;
 	}
 
-	/* Update disk state. */
-	g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
+	/* Find placeholder by position. */
+	olddisk = g_raid_md_intel_get_disk(sc, disk_pos);
+	if (olddisk == NULL)
+		panic("No disk at position %d!", disk_pos);
+	if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) {
+		G_RAID_DEBUG(1, "More then one disk for pos %d", disk_pos);
+		return;
+	}
+	oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data;
 
-	/* Create subdisks. */
-	for (i = 0; i < meta->total_volumes; i++) {
-		mvol = intel_get_volume(meta, i);
-		mmap = intel_get_map(mvol, 0);
-		for (j = 0; j < mmap->total_disks; j++) {
-			if ((mmap->disk_idx[j] & INTEL_DI_IDX) == pd->pd_disk_pos)
-				break;
-		}
-		if (j == mmap->total_disks)
-			continue;
-		vol = g_raid_md_intel_get_volume(sc, i);
-		sd = &vol->v_subdisks[j];
-		sd->sd_disk = disk;
-		sd->sd_offset = mmap->offset * 512; //ZZZ
-		sd->sd_size = mmap->disk_sectors;
-		LIST_INSERT_HEAD(&disk->d_subdisks, sd, sd_next);
+	/* Merge real disk and placeholder and destroy one of them. */
+	disk->d_consumer->private = olddisk;
+	olddisk->d_consumer = disk->d_consumer;
+	disk->d_consumer = NULL;
+	oldpd->pd_meta = pd->pd_meta;
+	pd->pd_meta = NULL;
+	g_raid_destroy_disk(disk);
+	disk = olddisk;
+
+	/* Welcome the "new" disk. */
+	g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
+	LIST_FOREACH(sd, &disk->d_subdisks, sd_next) {
 		g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
 		    G_RAID_EVENT_SUBDISK);
 	}
-
 }
 
 static void
@@ -495,18 +510,20 @@ g_raid_md_intel_start(struct g_raid_soft
 {
 	struct g_raid_md_object *md;
 	struct g_raid_md_intel_object *mdi;
+	struct g_raid_md_intel_perdisk *pd;
 	struct intel_raid_conf *meta;
 	struct intel_raid_vol *mvol;
 	struct intel_raid_map *mmap;
 	struct g_raid_volume *vol;
-	struct g_raid_disk *disk;
-	int i;
+	struct g_raid_subdisk *sd;
+	struct g_raid_disk *disk, *tmpdisk;
+	int i, j, disk_pos;
 
 	md = sc->sc_md;
 	mdi = (struct g_raid_md_intel_object *)md;
 	meta = mdi->mdio_meta;
 
-	/* Create volumes */
+	/* Create volumes and subdisks. */
 	for (i = 0; i < meta->total_volumes; i++) {
 		mvol = intel_get_volume(meta, i);
 		mmap = intel_get_map(mvol, 0);
@@ -528,10 +545,43 @@ g_raid_md_intel_start(struct g_raid_soft
 		vol->v_disks_count = mmap->total_disks;
 		vol->v_mediasize = mvol->total_sectors * 512; //ZZZ
 		vol->v_sectorsize = 512; //ZZZ
+		for (j = 0; j < vol->v_disks_count; j++) {
+			sd = &vol->v_subdisks[j];
+			sd->sd_offset = mmap->offset * 512; //ZZZ
+			sd->sd_size = mmap->disk_sectors;
+		}
 		g_raid_start_volume(vol);
 	}
-	LIST_FOREACH(disk, &sc->sc_disks, d_next)
-		g_raid_md_intel_start_disk(disk);
+
+	/* Create disk placeholders to store data for later writing. */
+	for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) {
+		pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
+		pd->pd_disk_pos = disk_pos;
+		pd->pd_disk_meta = meta->disk[disk_pos];
+		disk = g_raid_create_disk(sc);
+		disk->d_md_data = (void *)pd;
+		g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
+		for (i = 0; i < meta->total_volumes; i++) {
+			mvol = intel_get_volume(meta, i);
+			mmap = intel_get_map(mvol, 0);
+			for (j = 0; j < mmap->total_disks; j++) {
+				if ((mmap->disk_idx[j] & INTEL_DI_IDX) == disk_pos)
+					break;
+			}
+			if (j == mmap->total_disks)
+				continue;
+			vol = g_raid_md_intel_get_volume(sc, i);
+			sd = &vol->v_subdisks[j];
+			sd->sd_disk = disk;
+			LIST_INSERT_HEAD(&disk->d_subdisks, sd, sd_next);
+		}
+	}
+
+	/* Make existing disks take their places. */
+	LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpdisk) {
+		if (disk->d_state == G_RAID_DISK_S_NONE)
+			g_raid_md_intel_start_disk(disk);
+	}
 
 	mdi->mdio_started = 1;
 	callout_stop(&mdi->mdio_start_co);
@@ -546,7 +596,7 @@ g_raid_md_intel_new_disk(struct g_raid_d
 	struct g_raid_softc *sc;
 	struct g_raid_md_object *md;
 	struct g_raid_md_intel_object *mdi;
-	struct intel_raid_conf *meta, *pdmeta;
+	struct intel_raid_conf *pdmeta;
 	struct g_raid_md_intel_perdisk *pd;
 
 	sc = disk->d_softc;
@@ -555,30 +605,27 @@ g_raid_md_intel_new_disk(struct g_raid_d
 	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
 	pdmeta = pd->pd_meta;
 
-	if (mdi->mdio_meta == NULL ||
-	    pdmeta->generation > mdi->mdio_meta->generation) {
-		if (mdi->mdio_started) {
-			G_RAID_DEBUG(1, "Newer disk, but already started");
-		} else {
+	if (mdi->mdio_started) {
+		g_raid_md_intel_start_disk(disk);
+	} else {
+		/* If we haven't started yet - check metadata freshness. */
+		if (mdi->mdio_meta == NULL ||
+		    pdmeta->generation > mdi->mdio_generation) {
 			G_RAID_DEBUG(1, "Newer disk");
 			if (mdi->mdio_meta != NULL)
 				free(mdi->mdio_meta, M_MD_INTEL);
 			mdi->mdio_meta = intel_meta_copy(pdmeta);
+			mdi->mdio_generation = mdi->mdio_meta->generation;
 			mdi->mdio_disks_present = 1;
+		} else if (pdmeta->generation == mdi->mdio_generation) {
+			mdi->mdio_disks_present++;
+			G_RAID_DEBUG(1, "Matching disk (%d up)",
+			    mdi->mdio_disks_present);
+		} else {
+			G_RAID_DEBUG(1, "Older disk");
 		}
-	} else if (pdmeta->generation == mdi->mdio_meta->generation) {
-		mdi->mdio_disks_present++;
-		G_RAID_DEBUG(1, "Matching disk (%d up)",
-		    mdi->mdio_disks_present);
-	} else {
-		G_RAID_DEBUG(1, "Stale disk");
-	}
-
-	meta = mdi->mdio_meta;
-	if (mdi->mdio_started) {
-		g_raid_md_intel_start_disk(disk);
-	} else {
-		if (mdi->mdio_disks_present == meta->total_disks)
+		/* If we collected all needed disks - start array. */
+		if (mdi->mdio_disks_present == mdi->mdio_meta->total_disks)
 			g_raid_md_intel_start(sc);
 	}
 }
@@ -611,6 +658,7 @@ g_raid_md_create_intel(struct g_raid_md_
 
 	mdi = (struct g_raid_md_intel_object *)md;
 	mdi->mdio_config_id = arc4random();
+	mdi->mdio_generation = 0;
 	snprintf(name, sizeof(name), "Intel-%08x", mdi->mdio_config_id);
 	sc = g_raid_create_node(mp, name, md);
 	if (sc == NULL)
@@ -686,22 +734,16 @@ g_raid_md_taste_intel(struct g_raid_md_o
 		return (G_RAID_MD_TASTE_FAIL);
 
 	/* Check this disk position in obtained metadata. */
-	for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) {
-		if (strncmp(meta->disk[disk_pos].serial, serial, sizeof(serial))) {
-			G_RAID_DEBUG(1, "Intel serial mismatch '%s' '%s'",
-			    meta->disk[disk_pos].serial, serial);
-			continue;
-		}
-		if (meta->disk[disk_pos].sectors !=
-		    (pp->mediasize / pp->sectorsize)) {
-			G_RAID_DEBUG(1, "Intel size mismatch '%u' '%u'",
-			    meta->disk[disk_pos].sectors, (u_int)(pp->mediasize / pp->sectorsize));
-			continue;
-		}
-		break;
+	disk_pos = intel_meta_find_disk(meta, serial);
+	if (disk_pos < 0) {
+		G_RAID_DEBUG(1, "Intel serial '%s' not found", serial);
+		goto fail1;
 	}
-	if (disk_pos >= meta->total_disks) {
-		G_RAID_DEBUG(1, "Intel disk params check failed on %s", pp->name);
+	if (meta->disk[disk_pos].sectors !=
+	    (pp->mediasize / pp->sectorsize)) {
+		G_RAID_DEBUG(1, "Intel size mismatch %u != %u",
+		    meta->disk[disk_pos].sectors,
+		    (u_int)(pp->mediasize / pp->sectorsize));
 		goto fail1;
 	}
 
@@ -756,7 +798,7 @@ g_raid_md_taste_intel(struct g_raid_md_o
 
 	pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
 	pd->pd_meta = meta;
-	pd->pd_disk_pos = disk_pos;
+	pd->pd_disk_pos = -1;
 	pd->pd_disk_meta = meta->disk[disk_pos];
 	disk = g_raid_create_disk(sc);
 	disk->d_md_data = (void *)pd;
@@ -782,13 +824,33 @@ g_raid_md_event_intel(struct g_raid_md_o
     struct g_raid_disk *disk, u_int event)
 {
 	struct g_raid_softc *sc;
+	struct g_raid_subdisk *sd;
+	struct g_raid_md_intel_perdisk *pd;
 
 	sc = md->mdo_softc;
+	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
 	switch (event) {
 	case G_RAID_DISK_E_DISCONNECTED:
-		g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
-		g_raid_destroy_disk(disk);
-		if (g_raid_ndisks(sc, -1) == 0)
+		/* If disk was assigned, just update statuses. */
+		if (pd->pd_disk_pos >= 0) {
+			g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
+			if (disk->d_consumer) {
+				g_topology_lock();
+				g_raid_kill_consumer(sc, disk->d_consumer);
+				g_topology_unlock();
+				disk->d_consumer = NULL;
+			}
+			LIST_FOREACH(sd, &disk->d_subdisks, sd_next) {
+				g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
+				    G_RAID_EVENT_SUBDISK);
+			}
+		} else {
+			/* Otherwise -- delete. */
+			g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
+			g_raid_destroy_disk(disk);
+		}
+		if (g_raid_ndisks(sc, G_RAID_DISK_S_NONE) == 0 &&
+		    g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) == 0)
 			g_raid_destroy_node(sc, 0);
 		break;
 	}
@@ -846,8 +908,9 @@ g_raid_md_ctl_intel(struct g_raid_md_obj
 			gctl_error(req, "Unsupported RAID level.");
 			return (-5);
 		}
+
+		/* Search for disks, connect them and probe. */
 		numdisks = *nargs - 3;
-		mdi->mdio_generation = 1;
 		error = 0;
 		size = 0xffffffffffffffffllu;
 		sectorsize = 0;
@@ -921,6 +984,7 @@ g_raid_md_ctl_intel(struct g_raid_md_obj
 		/* Reserve some space for metadata. */
 		size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize;
 
+		/* Handle size argument. */
 		len = sizeof(*sizearg);
 		sizearg = gctl_get_param(req, "size", &len);
 		if (sizearg != NULL && len == sizeof(*sizearg)) {
@@ -931,6 +995,7 @@ g_raid_md_ctl_intel(struct g_raid_md_obj
 			size = *sizearg;
 		}
 
+		/* Handle strip argument. */
 		strip = 131072;
 		len = sizeof(*striparg);
 		striparg = gctl_get_param(req, "strip", &len);
@@ -949,8 +1014,10 @@ g_raid_md_ctl_intel(struct g_raid_md_obj
 			}
 			strip = *striparg;
 		}
-
 		size -= (size % strip);
+
+		/* We have all we need, create things: volume, ... */
+		mdi->mdio_started = 1;
 		vol = g_raid_create_volume(sc, volname);
 		vol->v_md_data = (void *)(intptr_t)0;
 		vol->v_raid_level = level;
@@ -966,6 +1033,7 @@ g_raid_md_ctl_intel(struct g_raid_md_obj
 		vol->v_sectorsize = sectorsize;
 		g_raid_start_volume(vol);
 
+		/* , and subdisks. */
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
 			sd = &vol->v_subdisks[pd->pd_disk_pos];
@@ -977,6 +1045,8 @@ g_raid_md_ctl_intel(struct g_raid_md_obj
 			g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
 			    G_RAID_EVENT_SUBDISK);
 		}
+
+		/* Write metadata based on created entities. */
 		g_raid_md_write_intel(md);
 		return (0);
 	}
@@ -1002,12 +1072,15 @@ g_raid_md_write_intel(struct g_raid_md_o
 	sc = md->mdo_softc;
 	mdi = (struct g_raid_md_intel_object *)md;
 
+	/* Bump generation, as written metadata may differ from previous. */
+	mdi->mdio_generation++;
+
 	/* Count number of disks. */
 	numdisks = 0;
 	i = 0;
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
-		if (pd->pd_disk_meta.flags & INTEL_F_ASSIGNED) {
+		if (pd->pd_disk_pos >= 0) {
 			numdisks++;
 			pd->pd_disk_pos = i++;
 		}
@@ -1017,8 +1090,6 @@ g_raid_md_write_intel(struct g_raid_md_o
 	meta = malloc(INTEL_MAX_MD_SIZE(numdisks),
 	    M_MD_INTEL, M_WAITOK | M_ZERO);
 	memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC));
-	memcpy(&meta->version[0], INTEL_VERSION_1200,
-	    sizeof(INTEL_VERSION_1200));
 	meta->config_size = INTEL_MAX_MD_SIZE(numdisks);
 	meta->config_id = mdi->mdio_config_id;
 	meta->generation = mdi->mdio_generation;
@@ -1038,6 +1109,9 @@ g_raid_md_write_intel(struct g_raid_md_o
 		mvol = intel_get_volume(meta, vi);
 		mmap = intel_get_map(mvol, 0);
 
+		/* New metadata may have different volumes order. */
+		vol->v_md_data = (void *)(intptr_t)vi;
+
 		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
 			sd = &vol->v_subdisks[sdi];
 			if (sd->sd_disk != NULL)


More information about the svn-src-projects mailing list