svn commit: r251681 - head/sys/netpfil/pf

Thu Jun 13 06:07:20 UTC 2013

Author: glebius
Date: Thu Jun 13 06:07:19 2013
New Revision: 251681
URL: http://svnweb.freebsd.org/changeset/base/251681

Log:
  Improve locking strategy between keys hash and ID hash.
  
  Before this change state creating sequence was:
  
  1) lock wire key hash
  2) link state's wire key
  3) unlock wire key hash
  4) lock stack key hash
  5) link state's stack key
  6) unlock stack key hash
  7) lock ID hash
  8) link into ID hash
  9) unlock ID hash
  
  What could happen here is that other thread finds the state via key
  hash lookup after 6), locks ID hash and does some processing of the
  state. When the thread creating state unblocks, it finds the state
  it was inserting already non-virgin.
  
  Now we perform proper interlocking between key hash locks and ID hash
  lock:
  
  1) lock wire & stack hashes
  2) link state's keys
  3) lock ID hash
  4) unlock wire & stack hashes
  5) link into ID hash
  6) unlock ID hash
  
  To achieve that, the following hacking was performed in pf_state_key_attach():
  
  - Key hash mutex is marked with MTX_DUPOK.
  - To avoid deadlock on 2 key hash mutexes, we lock them in order determined
    by their address value.
  - pf_state_key_attach() had a magic to reuse a > FIN_WAIT_2 state. It unlinked
    the conflicting state synchronously. In theory this could require locking
    a third key hash, which we can't do now.
    Now we do not remove the state immediately, instead we leave this task to
    the purge thread. To avoid conflicts in a short period before state is
    purged, we push to the very end of the TAILQ.
  - On success, before dropping key hash locks, pf_state_key_attach() locks
    ID hash and returns.
  
  Tested by:	Ian FREISLICH <ianf clue.co.za>

Modified:
  head/sys/netpfil/pf/pf.c

Modified: head/sys/netpfil/pf/pf.c
==============================================================================

--- head/sys/netpfil/pf/pf.c	Thu Jun 13 05:51:59 2013	(r251680)
+++ head/sys/netpfil/pf/pf.c	Thu Jun 13 06:07:19 2013	(r251681)
@@ -724,7 +724,7 @@ pf_initialize()
 	V_pf_hashmask = V_pf_hashsize - 1;
 	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
 	    i++, kh++, ih++) {
-		mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF);
+		mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK);
 		mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
 	}
 
@@ -851,7 +851,7 @@ static int
 pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
     struct pf_state *s)
 {
-	struct pf_keyhash	*kh;
+	struct pf_keyhash	*khs, *khw, *kh;
 	struct pf_state_key	*sk, *cur;
 	struct pf_state		*si, *olds = NULL;
 	int idx;
@@ -861,15 +861,47 @@ pf_state_key_attach(struct pf_state_key 
 	KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));
 
 	/*
+	 * We need to lock hash slots of both keys. To avoid deadlock
+	 * we always lock the slot with lower address first. Unlock order
+	 * isn't important.
+	 *
+	 * We also need to lock ID hash slot before dropping key
+	 * locks. On success we return with ID hash slot locked.
+	 */
+
+	if (skw == sks) {
+		khs = khw = &V_pf_keyhash[pf_hashkey(skw)];
+		PF_HASHROW_LOCK(khs);
+	} else {
+		khs = &V_pf_keyhash[pf_hashkey(sks)];
+		khw = &V_pf_keyhash[pf_hashkey(skw)];
+		if (khs == khw) {
+			PF_HASHROW_LOCK(khs);
+		} else if (khs < khw) {
+			PF_HASHROW_LOCK(khs);
+			PF_HASHROW_LOCK(khw);
+		} else {
+			PF_HASHROW_LOCK(khw);
+			PF_HASHROW_LOCK(khs);
+		}
+	}
+
+#define	KEYS_UNLOCK()	do {			\
+	if (khs != khw) {			\
+		PF_HASHROW_UNLOCK(khs);		\
+		PF_HASHROW_UNLOCK(khw);		\
+	} else					\
+		PF_HASHROW_UNLOCK(khs);		\
+} while (0)
+
+	/*
 	 * First run: start with wire key.
 	 */
 	sk = skw;
+	kh = khw;
 	idx = PF_SK_WIRE;
 
 keyattach:
-	kh = &V_pf_keyhash[pf_hashkey(sk)];
-
-	PF_HASHROW_LOCK(kh);
 	LIST_FOREACH(cur, &kh->keys, entry)
 		if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
 			break;
@@ -885,10 +917,20 @@ keyattach:
 				if (sk->proto == IPPROTO_TCP &&
 				    si->src.state >= TCPS_FIN_WAIT_2 &&
 				    si->dst.state >= TCPS_FIN_WAIT_2) {
+					/*
+					 * New state matches an old >FIN_WAIT_2
+					 * state. We can't drop key hash locks,
+					 * thus we can't unlink it properly.
+					 *
+					 * As a workaround we drop it into
+					 * TCPS_CLOSED state, schedule purge
+					 * ASAP and push it into the very end
+					 * of the slot TAILQ, so that it won't
+					 * conflict with our new state.
+					 */
 					si->src.state = si->dst.state =
 					    TCPS_CLOSED;
-					/* Unlink later or cur can go away. */
-					pf_ref_state(si);
+					si->timeout = PFTM_PURGE;
 					olds = si;
 				} else {
 					if (V_pf_status.debug >= PF_DEBUG_MISC) {
@@ -911,7 +953,7 @@ keyattach:
 						printf("\n");
 					}
 					PF_HASHROW_UNLOCK(ih);
-					PF_HASHROW_UNLOCK(kh);
+					KEYS_UNLOCK();
 					uma_zfree(V_pf_state_key_z, sk);
 					if (idx == PF_SK_STACK)
 						pf_detach_state(s);
@@ -934,6 +976,13 @@ stateattach:
 	else
 		TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);
 
+	if (olds) {
+		TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]);
+		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds,
+		    key_list[idx]);
+		olds = NULL;
+	}
+
 	/*
 	 * Attach done. See how should we (or should not?)
 	 * attach a second key.
@@ -944,31 +993,24 @@ stateattach:
 		sks = NULL;
 		goto stateattach;
 	} else if (sks != NULL) {
-		PF_HASHROW_UNLOCK(kh);
-		if (olds) {
-			pf_unlink_state(olds, 0);
-			pf_release_state(olds);
-			olds = NULL;
-		}
 		/*
 		 * Continue attaching with stack key.
 		 */
 		sk = sks;
+		kh = khs;
 		idx = PF_SK_STACK;
 		sks = NULL;
 		goto keyattach;
-	} else
-		PF_HASHROW_UNLOCK(kh);
-
-	if (olds) {
-		pf_unlink_state(olds, 0);
-		pf_release_state(olds);
 	}
 
+	PF_STATE_LOCK(s);
+	KEYS_UNLOCK();
+
 	KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
 	    ("%s failure", __func__));
 
 	return (0);
+#undef	KEYS_UNLOCK
 }
 
 static void
@@ -1091,11 +1133,12 @@ pf_state_insert(struct pfi_kif *kif, str
 		s->creatorid = V_pf_status.hostid;
 	}
 
+	/* Returns with ID locked on success. */
 	if ((error = pf_state_key_attach(skw, sks, s)) != 0)
 		return (error);
 
 	ih = &V_pf_idhash[PF_IDHASH(s)];
-	PF_HASHROW_LOCK(ih);
+	PF_HASHROW_ASSERT(ih);
 	LIST_FOREACH(cur, &ih->states, entry)
 		if (cur->id == s->id && cur->creatorid == s->creatorid)
 			break;