svn commit: r207371 - head/sbin/hastd

Pawel Jakub Dawidek pjd at FreeBSD.org
Thu Apr 29 15:36:33 UTC 2010


Author: pjd
Date: Thu Apr 29 15:36:32 2010
New Revision: 207371
URL: http://svn.freebsd.org/changeset/base/207371

Log:
  Fix a problem where hastd would get stuck in recv(2) after sending a
  request to a secondary that died between send(2) and recv(2). Do it by
  adding a timeout to recv(2) for the primary's incoming and outgoing
  sockets and the secondary's outgoing socket.
  
  Reported by:	Mikolaj Golub <to.my.trociny at gmail.com>
  Tested by:	Mikolaj Golub <to.my.trociny at gmail.com>
  MFC after:	3 days

Modified:
  head/sbin/hastd/hast.conf.5
  head/sbin/hastd/hast.h
  head/sbin/hastd/hastd.c
  head/sbin/hastd/parse.y
  head/sbin/hastd/primary.c
  head/sbin/hastd/proto.c
  head/sbin/hastd/proto.h
  head/sbin/hastd/proto_common.c
  head/sbin/hastd/secondary.c
  head/sbin/hastd/token.l

Modified: head/sbin/hastd/hast.conf.5
==============================================================================
--- head/sbin/hastd/hast.conf.5	Thu Apr 29 15:19:11 2010	(r207370)
+++ head/sbin/hastd/hast.conf.5	Thu Apr 29 15:36:32 2010	(r207371)
@@ -58,6 +58,7 @@ file is following:
 control <addr>
 listen <addr>
 replication <mode>
+timeout <seconds>
 
 on <node> {
 	# Node section
@@ -76,6 +77,7 @@ resource <name> {
 	replication <mode>
 	name <name>
 	local <path>
+	timeout <seconds>
 
 	on <node> {
 		# Resource-node section
@@ -194,6 +196,11 @@ The
 .Ic async
 replication mode is currently not implemented.
 .El
+.It Ic timeout Aq seconds
+.Pp
+Connection timeout in seconds.
+The default value is
+.Va 5 .
 .It Ic name Aq name
 .Pp
 GEOM provider name that will appear as

Modified: head/sbin/hastd/hast.h
==============================================================================
--- head/sbin/hastd/hast.h	Thu Apr 29 15:19:11 2010	(r207370)
+++ head/sbin/hastd/hast.h	Thu Apr 29 15:36:32 2010	(r207371)
@@ -75,6 +75,7 @@
 #define	HIO_DELETE		3
 #define	HIO_FLUSH		4
 
+#define	HAST_TIMEOUT	5
 #define	HAST_CONFIG	"/etc/hast.conf"
 #define	HAST_CONTROL	"/var/run/hastctl"
 #define	HASTD_PORT	8457
@@ -148,6 +149,8 @@ struct hast_resource {
 	/* Token to verify both in and out connection are coming from
 	   the same node (not necessarily from the same address). */
 	unsigned char hr_token[HAST_TOKEN_SIZE];
+	/* Connection timeout. */
+	int	hr_timeout;
 
 	/* Resource unique identifier. */
 	uint64_t hr_resuid;

Modified: head/sbin/hastd/hastd.c
==============================================================================
--- head/sbin/hastd/hastd.c	Thu Apr 29 15:19:11 2010	(r207370)
+++ head/sbin/hastd/hastd.c	Thu Apr 29 15:36:32 2010	(r207371)
@@ -187,6 +187,10 @@ listen_accept(void)
 	proto_remote_address(conn, raddr, sizeof(raddr));
 	pjdlog_info("Connection from %s to %s.", laddr, raddr);
 
+	/* Error in setting timeout is not critical, but why should it fail? */
+	if (proto_timeout(conn, HAST_TIMEOUT) < 0)
+		pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
+
 	nvin = nvout = nverr = NULL;
 
 	/*

Modified: head/sbin/hastd/parse.y
==============================================================================
--- head/sbin/hastd/parse.y	Thu Apr 29 15:19:11 2010	(r207370)
+++ head/sbin/hastd/parse.y	Thu Apr 29 15:36:32 2010	(r207371)
@@ -58,6 +58,7 @@ static bool mynode;
 static char depth0_control[HAST_ADDRSIZE];
 static char depth0_listen[HAST_ADDRSIZE];
 static int depth0_replication;
+static int depth0_timeout;
 
 static char depth1_provname[PATH_MAX];
 static char depth1_localpath[PATH_MAX];
@@ -115,6 +116,7 @@ yy_config_parse(const char *config)
 	curres = NULL;
 	mynode = false;
 
+	depth0_timeout = HAST_TIMEOUT;
 	depth0_replication = HAST_REPLICATION_MEMSYNC;
 	strlcpy(depth0_control, HAST_CONTROL, sizeof(depth0_control));
 	strlcpy(depth0_listen, HASTD_LISTEN, sizeof(depth0_listen));
@@ -154,6 +156,13 @@ yy_config_parse(const char *config)
 			 */
 			curres->hr_replication = depth0_replication;
 		}
+		if (curres->hr_timeout == -1) {
+			/*
+			 * Timeout is not set at resource-level.
+			 * Use global or default setting.
+			 */
+			curres->hr_timeout = depth0_timeout;
+		}
 	}
 
 	return (&lconfig);
@@ -171,7 +180,7 @@ yy_config_free(struct hastd_config *conf
 }
 %}
 
-%token CONTROL LISTEN PORT REPLICATION EXTENTSIZE RESOURCE NAME LOCAL REMOTE ON
+%token CONTROL LISTEN PORT REPLICATION TIMEOUT EXTENTSIZE RESOURCE NAME LOCAL REMOTE ON
 %token FULLSYNC MEMSYNC ASYNC
 %token NUM STR OB CB
 
@@ -200,6 +209,8 @@ statement:
 	|
 	replication_statement
 	|
+	timeout_statement
+	|
 	node_statement
 	|
 	resource_statement
@@ -281,6 +292,22 @@ replication_type:
 	ASYNC		{ $$ = HAST_REPLICATION_ASYNC; }
 	;
 
+timeout_statement:	TIMEOUT NUM
+	{
+		switch (depth) {
+		case 0:
+			depth0_timeout = $2;
+			break;
+		case 1:
+			if (curres != NULL)
+				curres->hr_timeout = $2;
+			break;
+		default:
+			assert(!"timeout at wrong depth level");
+		}
+	}
+	;
+
 node_statement:		ON node_start OB node_entries CB
 	{
 		mynode = false;
@@ -389,6 +416,7 @@ resource_start:	STR
 		curres->hr_role = HAST_ROLE_INIT;
 		curres->hr_previous_role = HAST_ROLE_INIT;
 		curres->hr_replication = -1;
+		curres->hr_timeout = -1;
 		curres->hr_provname[0] = '\0';
 		curres->hr_localpath[0] = '\0';
 		curres->hr_localfd = -1;
@@ -405,6 +433,8 @@ resource_entries:
 resource_entry:
 	replication_statement
 	|
+	timeout_statement
+	|
 	name_statement
 	|
 	local_statement

Modified: head/sbin/hastd/primary.c
==============================================================================
--- head/sbin/hastd/primary.c	Thu Apr 29 15:19:11 2010	(r207370)
+++ head/sbin/hastd/primary.c	Thu Apr 29 15:36:32 2010	(r207371)
@@ -489,6 +489,9 @@ init_remote(struct hast_resource *res, s
 		    res->hr_remoteaddr);
 		goto close;
 	}
+	/* Error in setting timeout is not critical, but why should it fail? */
+	if (proto_timeout(out, res->hr_timeout) < 0)
+		pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
 	/*
 	 * First handshake step.
 	 * Setup outgoing connection with remote node.
@@ -552,6 +555,9 @@ init_remote(struct hast_resource *res, s
 		    res->hr_remoteaddr);
 		goto close;
 	}
+	/* Error in setting timeout is not critical, but why should it fail? */
+	if (proto_timeout(in, res->hr_timeout) < 0)
+		pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
 	nvout = nv_alloc();
 	nv_add_string(nvout, res->hr_name, "resource");
 	nv_add_uint8_array(nvout, res->hr_token, sizeof(res->hr_token),

Modified: head/sbin/hastd/proto.c
==============================================================================
--- head/sbin/hastd/proto.c	Thu Apr 29 15:19:11 2010	(r207370)
+++ head/sbin/hastd/proto.c	Thu Apr 29 15:36:32 2010	(r207371)
@@ -30,7 +30,9 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include <sys/types.h>
 #include <sys/queue.h>
+#include <sys/socket.h>
 
 #include <assert.h>
 #include <errno.h>
@@ -247,6 +249,30 @@ proto_remote_address(const struct proto_
 	conn->pc_proto->hp_remote_address(conn->pc_ctx, addr, size);
 }
 
+int
+proto_timeout(const struct proto_conn *conn, int timeout)
+{
+	struct timeval tv;
+	int fd;
+
+	assert(conn != NULL);
+	assert(conn->pc_magic == PROTO_CONN_MAGIC);
+	assert(conn->pc_proto != NULL);
+
+	fd = proto_descriptor(conn);
+	if (fd < 0)
+		return (-1);
+
+	tv.tv_sec = timeout;
+	tv.tv_usec = 0;
+	if (setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)) < 0)
+		return (-1);
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
+		return (-1);
+
+	return (0);
+}
+
 void
 proto_close(struct proto_conn *conn)
 {

Modified: head/sbin/hastd/proto.h
==============================================================================
--- head/sbin/hastd/proto.h	Thu Apr 29 15:19:11 2010	(r207370)
+++ head/sbin/hastd/proto.h	Thu Apr 29 15:36:32 2010	(r207371)
@@ -49,6 +49,7 @@ void proto_local_address(const struct pr
     size_t size);
 void proto_remote_address(const struct proto_conn *conn, char *addr,
     size_t size);
+int proto_timeout(const struct proto_conn *conn, int timeout);
 void proto_close(struct proto_conn *conn);
 
 #endif	/* !_PROTO_H_ */

Modified: head/sbin/hastd/proto_common.c
==============================================================================
--- head/sbin/hastd/proto_common.c	Thu Apr 29 15:19:11 2010	(r207370)
+++ head/sbin/hastd/proto_common.c	Thu Apr 29 15:36:32 2010	(r207371)
@@ -58,7 +58,7 @@ proto_common_send(int fd, const unsigned
 		if (done == 0)
 			return (ENOTCONN);
 		else if (done < 0) {
-			if (errno == EAGAIN)
+			if (errno == EINTR)
 				continue;
 			return (errno);
 		}
@@ -76,7 +76,7 @@ proto_common_recv(int fd, unsigned char 
 
 	do {
 		done = recv(fd, data, size, MSG_WAITALL);
-	} while (done == -1 && errno == EAGAIN);
+	} while (done == -1 && errno == EINTR);
 	if (done == 0)
 		return (ENOTCONN);
 	else if (done < 0)

Modified: head/sbin/hastd/secondary.c
==============================================================================
--- head/sbin/hastd/secondary.c	Thu Apr 29 15:19:11 2010	(r207370)
+++ head/sbin/hastd/secondary.c	Thu Apr 29 15:36:32 2010	(r207371)
@@ -337,6 +337,12 @@ hastd_secondary(struct hast_resource *re
 
 	setproctitle("%s (secondary)", res->hr_name);
 
+	/* Error in setting timeout is not critical, but why should it fail? */
+	if (proto_timeout(res->hr_remotein, 0) < 0)
+		pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
+	if (proto_timeout(res->hr_remoteout, res->hr_timeout) < 0)
+		pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
+
 	init_local(res);
 	init_remote(res, nvin);
 	init_environment();

Modified: head/sbin/hastd/token.l
==============================================================================
--- head/sbin/hastd/token.l	Thu Apr 29 15:19:11 2010	(r207370)
+++ head/sbin/hastd/token.l	Thu Apr 29 15:36:32 2010	(r207371)
@@ -48,6 +48,7 @@ control			{ DP; return CONTROL; }
 listen			{ DP; return LISTEN; }
 port			{ DP; return PORT; }
 replication		{ DP; return REPLICATION; }
+timeout			{ DP; return TIMEOUT; }
 resource		{ DP; return RESOURCE; }
 name			{ DP; return NAME; }
 local			{ DP; return LOCAL; }


More information about the svn-src-head mailing list