svn commit: r220898 - head/sbin/hastd

Pawel Jakub Dawidek pjd at FreeBSD.org
Wed Apr 20 18:43:29 UTC 2011


Author: pjd
Date: Wed Apr 20 18:43:28 2011
New Revision: 220898
URL: http://svn.freebsd.org/changeset/base/220898

Log:
  When we become primary, we connect to the remote and expect it to be in
  the secondary role. It is possible that the remote node is primary, but only
  because there was a role change and it didn't finish cleaning up (unmounting
  file systems, etc.). If we detect such a situation, wait for the remote node
  to switch its role to secondary before accepting I/Os. If we don't wait for
  it in that case, we will most likely cause split-brain.
  
  MFC after:	1 week

Modified:
  head/sbin/hastd/hastd.c
  head/sbin/hastd/primary.c

Modified: head/sbin/hastd/hastd.c
==============================================================================
--- head/sbin/hastd/hastd.c	Wed Apr 20 18:04:34 2011	(r220897)
+++ head/sbin/hastd/hastd.c	Wed Apr 20 18:43:28 2011	(r220898)
@@ -736,6 +736,13 @@ listen_accept(void)
 		nv_add_stringf(nverr, "errmsg",
 		    "Remote node acts as %s for the resource and not as %s.",
 		    role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY));
+		if (res->hr_role == HAST_ROLE_PRIMARY) {
+			/*
+			 * If we act as primary, request the other side to wait
+			 * for us for a bit, as we might be finishing cleanups.
+			 */
+			nv_add_uint8(nverr, 1, "wait");
+		}
 		goto fail;
 	}
 	/* Does token (if exists) match? */

Modified: head/sbin/hastd/primary.c
==============================================================================
--- head/sbin/hastd/primary.c	Wed Apr 20 18:04:34 2011	(r220897)
+++ head/sbin/hastd/primary.c	Wed Apr 20 18:43:28 2011	(r220898)
@@ -219,6 +219,7 @@ static pthread_cond_t range_regular_cond
 static struct rangelocks *range_sync;
 static bool range_sync_wait;
 static pthread_cond_t range_sync_cond;
+static bool fullystarted;
 
 static void *ggate_recv_thread(void *arg);
 static void *local_send_thread(void *arg);
@@ -524,7 +525,7 @@ primary_connect(struct hast_resource *re
 	return (0);
 }
 
-static bool
+static int
 init_remote(struct hast_resource *res, struct proto_conn **inp,
     struct proto_conn **outp)
 {
@@ -537,6 +538,7 @@ init_remote(struct hast_resource *res, s
 	int64_t datasize;
 	uint32_t mapsize;
 	size_t size;
+	int error;
 
 	PJDLOG_ASSERT((inp == NULL && outp == NULL) || (inp != NULL && outp != NULL));
 	PJDLOG_ASSERT(real_remote(res));
@@ -545,7 +547,9 @@ init_remote(struct hast_resource *res, s
 	errmsg = NULL;
 
 	if (primary_connect(res, &out) == -1)
-		return (false);
+		return (ECONNREFUSED);
+
+	error = ECONNABORTED;
 
 	/*
 	 * First handshake step.
@@ -577,6 +581,8 @@ init_remote(struct hast_resource *res, s
 	errmsg = nv_get_string(nvin, "errmsg");
 	if (errmsg != NULL) {
 		pjdlog_warning("%s", errmsg);
+		if (nv_exists(nvin, "wait"))
+			error = EBUSY;
 		nv_free(nvin);
 		goto close;
 	}
@@ -734,14 +740,14 @@ init_remote(struct hast_resource *res, s
 		res->hr_remoteout = out;
 	}
 	event_send(res, EVENT_CONNECT);
-	return (true);
+	return (0);
 close:
 	if (errmsg != NULL && strcmp(errmsg, "Split-brain condition!") == 0)
 		event_send(res, EVENT_SPLITBRAIN);
 	proto_close(out);
 	if (in != NULL)
 		proto_close(in);
-	return (false);
+	return (error);
 }
 
 static void
@@ -920,8 +926,30 @@ hastd_primary(struct hast_resource *res)
 	 */
 	error = pthread_create(&td, NULL, ctrl_thread, res);
 	PJDLOG_ASSERT(error == 0);
-	if (real_remote(res) && init_remote(res, NULL, NULL))
-		sync_start();
+	if (real_remote(res)) {
+		error = init_remote(res, NULL, NULL);
+		if (error == 0) {
+			sync_start();
+		} else if (error == EBUSY) {
+			time_t start = time(NULL);
+
+			pjdlog_warning("Waiting for remote node to become %s for %ds.",
+			    role2str(HAST_ROLE_SECONDARY),
+			    res->hr_timeout);
+			for (;;) {
+				sleep(1);
+				error = init_remote(res, NULL, NULL);
+				if (error != EBUSY)
+					break;
+				if (time(NULL) > start + res->hr_timeout)
+					break;
+			}
+			if (error == EBUSY) {
+				pjdlog_warning("Remote node is still %s, starting anyway.",
+				    role2str(HAST_ROLE_PRIMARY));
+			}
+		}
+	}
 	error = pthread_create(&td, NULL, ggate_recv_thread, res);
 	PJDLOG_ASSERT(error == 0);
 	error = pthread_create(&td, NULL, local_send_thread, res);
@@ -932,6 +960,7 @@ hastd_primary(struct hast_resource *res)
 	PJDLOG_ASSERT(error == 0);
 	error = pthread_create(&td, NULL, ggate_send_thread, res);
 	PJDLOG_ASSERT(error == 0);
+	fullystarted = true;
 	(void)sync_thread(res);
 }
 
@@ -2095,7 +2124,7 @@ guard_one(struct hast_resource *res, uns
 	pjdlog_debug(2, "remote_guard: Reconnecting to %s.",
 	    res->hr_remoteaddr);
 	in = out = NULL;
-	if (init_remote(res, &in, &out)) {
+	if (init_remote(res, &in, &out) == 0) {
 		rw_wlock(&hio_remote_lock[ncomp]);
 		PJDLOG_ASSERT(res->hr_remotein == NULL);
 		PJDLOG_ASSERT(res->hr_remoteout == NULL);
@@ -2153,12 +2182,19 @@ guard_thread(void *arg)
 			break;
 		}
 
-		pjdlog_debug(2, "remote_guard: Checking connections.");
-		now = time(NULL);
-		if (lastcheck + HAST_KEEPALIVE <= now) {
-			for (ii = 0; ii < ncomps; ii++)
-				guard_one(res, ii);
-			lastcheck = now;
+		/*
+		 * Don't check connections until we have fully started,
+		 * as we may still be looping, waiting for the remote node
+		 * to switch from primary to secondary.
+		 */
+		if (fullystarted) {
+			pjdlog_debug(2, "remote_guard: Checking connections.");
+			now = time(NULL);
+			if (lastcheck + HAST_KEEPALIVE <= now) {
+				for (ii = 0; ii < ncomps; ii++)
+					guard_one(res, ii);
+				lastcheck = now;
+			}
 		}
 		signo = sigtimedwait(&mask, NULL, &timeout);
 	}


More information about the svn-src-all mailing list