rdr pass for proto tcp sometimes creates states with expire time zero and so breaking connections

Konstantin Belousov kib at freebsd.org
Fri Jan 25 13:14:18 UTC 2019


On Thu, Jan 24, 2019 at 11:09:37PM +0100, Andreas Longwitz wrote:
> >> I think the problem is the cmpxchg8b instruction used in
> >> counter_u64_fetch(), because this machine instruction always writes to
> >> memory, also when we only want to read and have (EDX:EAX) = (ECX:EBX):
> >>
> >>     TEMP64 <- DEST
> >>     IF (EDX:EAX = TEMP64)
> >>        THEN
> >>           ZF <- 1
> >>           DEST <- ECX:EBX
> >>        ELSE
> >>           ZF <- 0
> >>           EDX:EAX <- TEMP64
> >>           DEST <- TEMP64
> >>     FI
> >>
> >> If one CPU increments the counter in pf_create_state() and another does
> >> the fetch, then both CPU's may run the xmpxschg8b at once with the
> >> chance that both read the same memory value in TEMP64 and the fetching
> >> CPU is the second CPU that writes and so the increment is lossed. Thats
> >> what I see without the above patch two or three times a week.
> > 
> > Please try the following patch.  The idea is to make the value to compare
> > with unlikely to be equal to the memory content, for fetch_one().
> 
> During my research I first had the same idea, but it did not work. In
> the actual coding eax/edx is not well defined before cmpxchg8b is
> executed, but it does not help for the problem to do so.
> 
> > Also it fixes a silly bug in zero_one().
> > 
> > diff --git a/sys/i386/include/counter.h b/sys/i386/include/counter.h
> > index 7fd26d2a960..aa20831ba18 100644
> > --- a/sys/i386/include/counter.h
> > +++ b/sys/i386/include/counter.h
> > @@ -78,6 +78,9 @@ counter_u64_read_one_8b(uint64_t *p)
> >  	uint32_t res_lo, res_high;
> >  
> >  	__asm __volatile(
> > +	"movl	(%0),%%eax\n\t"
> > +	"movl	4(%0),%%edx\n\t"
> > +	"addl	$0x10000000,%%edx\n\t"	/* avoid write */
> >  	"movl	%%eax,%%ebx\n\t"
> >  	"movl	%%edx,%%ecx\n\t"
> >  	"cmpxchg8b	(%2)"
> 
> We can not avoid the write done by cmpxchg8b as can be seen from the
> microcode given above, we always end up with "DEST <- TEMP". From the
> Intel instruction reference manual:
> 
> The destination operand is written back if the comparision fails. (The
> processor never produces a locked read without also producing a locked
> write).
I see, AMD APM is more clear there, stating that the instruction always
do rmw regardless of lock prefix.

> 
> Maybe it is enough to prefix the cmpxchg8b with LOCK only in function
> counter_u64_read_one_8b().
I am not sure.  Lets switch to IPI method for fetch, similar to clear.
I do not think that the cost of fetch is too important comparing with
the race.

> 
> 
> > @@ -120,11 +123,11 @@ counter_u64_zero_one_8b(uint64_t *p)
> >  {
> >  	__asm __volatile(
> > +"\n1:\n\t"
> >  	"movl	(%0),%%eax\n\t"
> > -	"movl	4(%0),%%edx\n"
> > +	"movl	4(%0),%%edx\n\t"
> >  	"xorl	%%ebx,%%ebx\n\t"
> >  	"xorl	%%ecx,%%ecx\n\t"
> > -"1:\n\t"
> >  	"cmpxchg8b	(%0)\n\t"
> >  	"jnz	1b"
> >  	:
> 
> If jnz jumps back the instruction cmpxchg8b has load registers eax/edx
> with (%0), therefor I do not understand the silly bug.
Ignore me.


diff --git a/sys/i386/include/counter.h b/sys/i386/include/counter.h
index 7fd26d2a960..278f89123a4 100644
--- a/sys/i386/include/counter.h
+++ b/sys/i386/include/counter.h
@@ -72,7 +72,12 @@ counter_64_inc_8b(uint64_t *p, int64_t inc)
 }
 
 #ifdef IN_SUBR_COUNTER_C
-static inline uint64_t
+struct counter_u64_fetch_cx8_arg {
+	uint64_t res;
+	uint64_t *p;
+};
+
+static uint64_t
 counter_u64_read_one_8b(uint64_t *p)
 {
 	uint32_t res_lo, res_high;
@@ -87,9 +92,22 @@ counter_u64_read_one_8b(uint64_t *p)
 	return (res_lo + ((uint64_t)res_high << 32));
 }
 
+static void
+counter_u64_fetch_cx8_one(void *arg1)
+{
+	struct counter_u64_fetch_cx8_arg *arg;
+	uint64_t val;
+
+	arg = arg1;
+	val = counter_u64_read_one_8b((uint64_t *)((char *)arg->p +
+	    UMA_PCPU_ALLOC_SIZE * PCPU_GET(cpuid)));
+	atomic_add_64(&arg->res, val);
+}
+
 static inline uint64_t
 counter_u64_fetch_inline(uint64_t *p)
 {
+	struct counter_u64_fetch_cx8_arg arg;
 	uint64_t res;
 	int i;
 
@@ -108,9 +126,10 @@ counter_u64_fetch_inline(uint64_t *p)
 		}
 		critical_exit();
 	} else {
-		CPU_FOREACH(i)
-			res += counter_u64_read_one_8b((uint64_t *)((char *)p +
-			    UMA_PCPU_ALLOC_SIZE * i));
+		arg.p = p;
+		arg.res = 0;
+		smp_rendezvous(NULL, counter_u64_fetch_cx8_one, NULL, &arg);
+		res = arg.res;
 	}
 	return (res);
 }


More information about the freebsd-pf mailing list