Fwd: Questions with the in_cksumdata() function in sys/amd64/amd64/in_cksum.c
Konstantin Belousov
kostikbel at gmail.com
Mon Oct 20 08:34:02 UTC 2014
On Mon, Oct 20, 2014 at 01:47:55PM +0800, Tiwei Bie wrote:
> > I would not be surprised if this manual prefetching by explicit reads
> > causes slowdown of the function. I suspect it could confuse hardware
> > prefetcher by breaking the linear pattern, or the patch could break
> > the logic of the limited forward-looking oracle by reading too far
> > from the current linear read tip.
> >
> > Also, it could confuse the data flow engine if the register allocator
> > is unable to see that the read value is needed not right now, and cause
> > unneeded stall while next cache line is fetched.
> >
> > Sure, all my speculations are pure garbage until confirmed by
> > measurements with pmc, but I think that the patch below must be
> > benchmarked to confirm any value it provides as well. My opinion is,
> > we should either remove the manual prefetch, or do it with PREFETCHLX
> > instructions only, instead of real read.
>
> I have done a rather simple test. And the results are listed as follows:
>
Yes, too simple to draw a conclusion, IMO.
Please look at the ministat(1). I think that the test run length
is too short to come with any decisions. The length x 3 runs does not
give enough confidence; but ministat would provide the numbers to judge.
> #1. Read 32 bytes with manual pre-read in each loop:
>
> $ cc main.c -D_32BYTES_WITH_PRE_READ
> $ for i in `seq 3`; do ./a.out; done
> 0.768854
> 0.770332
> 0.773803
>
> #2. Read 64 bytes with manual pre-read in each loop:
>
> $ cc main.c -D_64BYTES_WITH_PRE_READ
> $ for i in `seq 3`; do ./a.out; done
> 0.702416
> 0.703648
> 0.704498
>
> #3. Read 32 bytes without manual prefetch in each loop:
>
> $ cc main.c -D_32BYTES_WITHOUT_MANUAL_PREFETCH
> $ for i in `seq 3`; do ./a.out; done
> 0.726651
> 0.728841
> 0.725956
>
> #4. Read 64 bytes without manual prefetch in each loop:
>
> $ cc main.c -D_64BYTES_WITHOUT_MANUAL_PREFETCH
> $ for i in `seq 3`; do ./a.out; done
> 0.698071
> 0.697971
> 0.698314
>
> #5. Read 64 bytes with PREFETCH instruction:
>
> $ cc main.c -D_64BYTES_WITH_PREFETCH_INSTRUCTION
> $ for i in `seq 3`; do ./a.out; done
> 0.715883
> 0.712118
> 0.711962
>
> The test is very simple. I just run the in_cksumdata() function on one
> million packets. And the result is the time spent on calculating these
> checksums.
>
> As we can see from the results, reading 64 bytes of data without a manual
> prefetch operation in each loop is the fastest. So, I think reading
> a whole cache line in each loop is helpful.
>
> ---
>
> The computer that I run the test program on:
>
> $ dmesg | grep CPU:
> CPU: Intel(R) Core(TM) i5-2400 CPU @ 3.10GHz (3093.03-MHz K8-class CPU)
>
> ---
>
> The test program:
>
> #include <stdio.h>
> #include <sys/types.h>
> #include <sys/time.h>
>
> /* ------------------------------------------------------------------------ */
>
> /* 2^20 packets of 1500 bytes each: ~1.5 GB of zero-initialized BSS. */
> /* NOTE(review): buffer is never written anywhere in this program, so every
>  * checksum is computed over all-zero data; cache/TLB behavior should still
>  * be representative, but the summed values are not. */
> #define PACKET_SIZE 1500
> #define BUFFER_SIZE ((PACKET_SIZE) << 20)
> static unsigned char buffer[BUFFER_SIZE];
>
> /* ------------------------------------------------------------------------ */
>
> /*
> * Checksum routine for Internet Protocol family headers
> * (Portable Alpha version).
> *
> * This routine is very heavily used in the network
> * code and should be modified for each CPU to be as fast as possible.
> */
>
> /* Fold a carry out of the low 16 bits back into the sum (one step of
>  * one's-complement carry folding).
>  * NOTE(review): 'x' is used unparenthesized and is modified by the ?:
>  * branch -- only safe when the argument is a simple lvalue, as in the
>  * REDUCE16 use below. */
> #define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
> /* Fold the 64-bit accumulator 'sum' down to (at most) 18 significant bits
>  * by adding its four 16-bit halves through the q_util union view. */
> #define REDUCE32 \
> { \
> q_util.q = sum; \
> sum = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
> }
> /* Fold the 64-bit accumulator all the way to a 16-bit checksum value. */
> #define REDUCE16 \
> { \
> q_util.q = sum; \
> l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
> sum = l_util.s[0] + l_util.s[1]; \
> ADDCARRY(sum); \
> }
>
> /* Masks selecting the valid bytes of a 32-bit word read from a backed-up,
>  * word-aligned pointer: row = byte offset within the word, column = number
>  * of valid bytes. NOTE(review): the bit layout assumes little-endian byte
>  * order (amd64), matching the file this test was derived from. */
> static const u_int32_t in_masks[] = {
> /*0 bytes*/ /*1 byte*/ /*2 bytes*/ /*3 bytes*/
> 0x00000000, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, /* offset 0 */
> 0x00000000, 0x0000FF00, 0x00FFFF00, 0xFFFFFF00, /* offset 1 */
> 0x00000000, 0x00FF0000, 0xFFFF0000, 0xFFFF0000, /* offset 2 */
> 0x00000000, 0xFF000000, 0xFF000000, 0xFF000000, /* offset 3 */
> };
>
> /* Overlay views used by REDUCE16 to split a 32-bit value into halves. */
> union l_util {
> u_int16_t s[2];
> u_int32_t l;
> };
> /* Overlay views used by REDUCE32/REDUCE16 to split the 64-bit sum. */
> union q_util {
> u_int16_t s[4];
> u_int32_t l[2];
> u_int64_t q;
> };
>
> /* ------------------------------------------------------------------------ */
>
> //#define _32BYTES_WITH_PRE_READ
> //#define _64BYTES_WITH_PRE_READ
> //#define _32BYTES_WITHOUT_MANUAL_PREFETCH
> //#define _64BYTES_WITHOUT_MANUAL_PREFETCH
> //#define _64BYTES_WITH_PREFETCH_INSTRUCTION
>
> #ifdef _32BYTES_WITH_PRE_READ
> /*
>  * Variant #1: sum the buffer 32 bytes (8 32-bit words) per main-loop
>  * iteration, manually pre-reading the first word of the NEXT 32-byte
>  * group into 'prefilled' so its cache line starts loading while the
>  * current group is being added. Returns the partial one's-complement
>  * sum folded to <= 18 bits by REDUCE32.
>  */
> static u_int64_t
> in_cksumdata(const void *buf, int len)
> {
> const u_int32_t *lw = (const u_int32_t *) buf;
> u_int64_t sum = 0;
> u_int64_t prefilled;
> int offset;
> union q_util q_util;
>
> /* Fast path: a 4-byte-aligned 20-byte buffer (a standard IPv4 header). */
> if ((3 & (long) lw) == 0 && len == 20) {
> sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> REDUCE32;
> return sum;
> }
>
> /*
>  * Unaligned start: back lw up to the previous 4-byte boundary and use
>  * in_masks (row = offset within the word, column = min(len, 3) valid
>  * bytes) to discard the bytes that precede the buffer.
>  */
> if ((offset = 3 & (long) lw) != 0) {
> const u_int32_t *masks = in_masks + (offset << 2);
> lw = (u_int32_t *) (((long) lw) - offset);
> sum = *lw++ & masks[len >= 3 ? 3 : len];
> len -= 4 - offset;
> if (len <= 0) {
> REDUCE32;
> return sum;
> }
> }
> #if 0
> /*
> * Force to cache line boundary.
> */
> offset = 32 - (0x1f & (long) lw);
> if (offset < 32 && len > offset) {
> len -= offset;
> if (4 & offset) {
> sum += (u_int64_t) lw[0];
> lw += 1;
> }
> if (8 & offset) {
> sum += (u_int64_t) lw[0] + lw[1];
> lw += 2;
> }
> if (16 & offset) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> }
> #endif
> /*
> * access prefilling to start load of next cache line.
> * then add current cache line
> * save result of prefilling for loop iteration.
> */
> /* The ">= 4" guard ensures at least 4 bytes remain beyond the current
>  * group, so the lw[8] pre-read below never runs past the buffer. The
>  * 64-bit 'prefilled' also promotes the whole addition chain to 64 bits. */
> prefilled = lw[0];
> while ((len -= 32) >= 4) {
> u_int64_t prefilling = lw[8];
> sum += prefilled + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7];
> lw += 8;
> prefilled = prefilling;
> }
> /* len >= 0 here means one full 32-byte group remains and 'prefilled'
>  * already holds its first word; otherwise undo the last decrement. */
> if (len >= 0) {
> sum += prefilled + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7];
> lw += 8;
> } else {
> len += 32;
> }
> /* Tail: one 16-byte chunk, then 4-byte words, then 1-3 masked bytes. */
> while ((len -= 16) >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> len += 16;
> while ((len -= 4) >= 0) {
> sum += (u_int64_t) *lw++;
> }
> len += 4;
> if (len > 0)
> sum += (u_int64_t) (in_masks[len] & *lw);
> REDUCE32;
> return sum;
> }
> #endif
>
> #ifdef _64BYTES_WITH_PRE_READ
> /*
>  * Variant #2: sum the buffer 64 bytes (16 32-bit words, one full cache
>  * line on this CPU) per main-loop iteration, manually pre-reading the
>  * first word of the NEXT 64-byte group into 'prefilled' so its cache
>  * line starts loading while the current group is being added. Returns
>  * the partial one's-complement sum folded by REDUCE32.
>  */
> static u_int64_t
> in_cksumdata(const void *buf, int len)
> {
> const u_int32_t *lw = (const u_int32_t *) buf;
> u_int64_t sum = 0;
> u_int64_t prefilled;
> int offset;
> union q_util q_util;
>
> /* Fast path: a 4-byte-aligned 20-byte buffer (a standard IPv4 header). */
> if ((3 & (long) lw) == 0 && len == 20) {
> sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> REDUCE32;
> return sum;
> }
>
> /*
>  * Unaligned start: back lw up to the previous 4-byte boundary and use
>  * in_masks to discard the bytes that precede the buffer.
>  */
> if ((offset = 3 & (long) lw) != 0) {
> const u_int32_t *masks = in_masks + (offset << 2);
> lw = (u_int32_t *) (((long) lw) - offset);
> sum = *lw++ & masks[len >= 3 ? 3 : len];
> len -= 4 - offset;
> if (len <= 0) {
> REDUCE32;
> return sum;
> }
> }
> #if 0
> /*
> * Force to cache line boundary.
> */
> offset = 32 - (0x1f & (long) lw);
> if (offset < 32 && len > offset) {
> len -= offset;
> if (4 & offset) {
> sum += (u_int64_t) lw[0];
> lw += 1;
> }
> if (8 & offset) {
> sum += (u_int64_t) lw[0] + lw[1];
> lw += 2;
> }
> if (16 & offset) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> }
> #endif
> /*
> * access prefilling to start load of next cache line.
> * then add current cache line
> * save result of prefilling for loop iteration.
> */
> /* The ">= 4" guard ensures at least 4 bytes remain beyond the current
>  * group, so the lw[16] pre-read below never runs past the buffer. The
>  * 64-bit 'prefilled' also promotes the whole addition chain to 64 bits. */
> prefilled = lw[0];
> while ((len -= 64) >= 4) {
> u_int64_t prefilling = lw[16];
> sum += prefilled + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7]
> + lw[8] + lw[9] + lw[10] + lw[11]
> + lw[12] + lw[13] + lw[14] + lw[15];
> lw += 16;
> prefilled = prefilling;
> }
> /* len >= 0 here means one full 64-byte group remains and 'prefilled'
>  * already holds its first word; otherwise undo the last decrement. */
> if (len >= 0) {
> sum += prefilled + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7]
> + lw[8] + lw[9] + lw[10] + lw[11]
> + lw[12] + lw[13] + lw[14] + lw[15];
> lw += 16;
> } else {
> len += 64;
> }
> /* Tail: 16-byte chunks, then 4-byte words, then 1-3 masked bytes. */
> while ((len -= 16) >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> len += 16;
> while ((len -= 4) >= 0) {
> sum += (u_int64_t) *lw++;
> }
> len += 4;
> if (len > 0)
> sum += (u_int64_t) (in_masks[len] & *lw);
> REDUCE32;
> return sum;
> }
> #endif
>
> #ifdef _32BYTES_WITHOUT_MANUAL_PREFETCH
> /*
>  * Variant #3: sum the buffer 32 bytes (8 32-bit words) per main-loop
>  * iteration with no manual prefetch -- the hardware prefetcher is
>  * trusted to follow the linear read pattern. Returns the partial
>  * one's-complement sum folded by REDUCE32.
>  */
> static u_int64_t
> in_cksumdata(const void *buf, int len)
> {
> const u_int32_t *lw = (const u_int32_t *) buf;
> u_int64_t sum = 0;
> int offset;
> union q_util q_util;
>
> /* Fast path: a 4-byte-aligned 20-byte buffer (a standard IPv4 header). */
> if ((3 & (long) lw) == 0 && len == 20) {
> sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> REDUCE32;
> return sum;
> }
>
> /*
>  * Unaligned start: back lw up to the previous 4-byte boundary and use
>  * in_masks to discard the bytes that precede the buffer.
>  */
> if ((offset = 3 & (long) lw) != 0) {
> const u_int32_t *masks = in_masks + (offset << 2);
> lw = (u_int32_t *) (((long) lw) - offset);
> sum = *lw++ & masks[len >= 3 ? 3 : len];
> len -= 4 - offset;
> if (len <= 0) {
> REDUCE32;
> return sum;
> }
> }
> #if 0
> /*
> * Force to cache line boundary.
> */
> offset = 32 - (0x1f & (long) lw);
> if (offset < 32 && len > offset) {
> len -= offset;
> if (4 & offset) {
> sum += (u_int64_t) lw[0];
> lw += 1;
> }
> if (8 & offset) {
> sum += (u_int64_t) lw[0] + lw[1];
> lw += 2;
> }
> if (16 & offset) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> }
> #endif
> /*
> * Main loop: 32 bytes per iteration. BUGFIX: the leading (u_int64_t)
> * cast promotes the whole addition chain to 64 bits, matching every
> * other summing loop in this file; previously these eight words were
> * added in 32-bit arithmetic, silently discarding carries on large
> * values -- and doing cheaper 32-bit adds than the pre-read variants,
> * which skewed the benchmark comparison.
> */
> while ((len -= 32) >= 4) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7];
> lw += 8;
> }
> if (len >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7];
> lw += 8;
> } else {
> len += 32;
> }
> /* Tail: one 16-byte chunk, then 4-byte words, then 1-3 masked bytes. */
> while ((len -= 16) >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> len += 16;
> while ((len -= 4) >= 0) {
> sum += (u_int64_t) *lw++;
> }
> len += 4;
> if (len > 0)
> sum += (u_int64_t) (in_masks[len] & *lw);
> REDUCE32;
> return sum;
> }
> #endif
>
> #ifdef _64BYTES_WITHOUT_MANUAL_PREFETCH
> /*
>  * Variant #4: sum the buffer 64 bytes (16 32-bit words, a full cache
>  * line) per main-loop iteration with no manual prefetch -- the hardware
>  * prefetcher is trusted to follow the linear read pattern. Returns the
>  * partial one's-complement sum folded by REDUCE32.
>  */
> static u_int64_t
> in_cksumdata(const void *buf, int len)
> {
> const u_int32_t *lw = (const u_int32_t *) buf;
> u_int64_t sum = 0;
> int offset;
> union q_util q_util;
>
> /* Fast path: a 4-byte-aligned 20-byte buffer (a standard IPv4 header). */
> if ((3 & (long) lw) == 0 && len == 20) {
> sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> REDUCE32;
> return sum;
> }
>
> /*
>  * Unaligned start: back lw up to the previous 4-byte boundary and use
>  * in_masks to discard the bytes that precede the buffer.
>  */
> if ((offset = 3 & (long) lw) != 0) {
> const u_int32_t *masks = in_masks + (offset << 2);
> lw = (u_int32_t *) (((long) lw) - offset);
> sum = *lw++ & masks[len >= 3 ? 3 : len];
> len -= 4 - offset;
> if (len <= 0) {
> REDUCE32;
> return sum;
> }
> }
> #if 0
> /*
> * Force to cache line boundary.
> */
> offset = 32 - (0x1f & (long) lw);
> if (offset < 32 && len > offset) {
> len -= offset;
> if (4 & offset) {
> sum += (u_int64_t) lw[0];
> lw += 1;
> }
> if (8 & offset) {
> sum += (u_int64_t) lw[0] + lw[1];
> lw += 2;
> }
> if (16 & offset) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> }
> #endif
> /*
> * Main loop: 64 bytes per iteration. BUGFIX: the leading (u_int64_t)
> * cast promotes the whole addition chain to 64 bits, matching every
> * other summing loop in this file; previously these sixteen words were
> * added in 32-bit arithmetic, silently discarding carries on large
> * values -- and doing cheaper 32-bit adds than the pre-read variants,
> * which skewed the benchmark comparison.
> */
> while ((len -= 64) >= 4) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7]
> + lw[8] + lw[9] + lw[10] + lw[11]
> + lw[12] + lw[13] + lw[14] + lw[15];
> lw += 16;
> }
> if (len >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7]
> + lw[8] + lw[9] + lw[10] + lw[11]
> + lw[12] + lw[13] + lw[14] + lw[15];
> lw += 16;
> } else {
> len += 64;
> }
> /* Tail: 16-byte chunks, then 4-byte words, then 1-3 masked bytes. */
> while ((len -= 16) >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> len += 16;
> while ((len -= 4) >= 0) {
> sum += (u_int64_t) *lw++;
> }
> len += 4;
> if (len > 0)
> sum += (u_int64_t) (in_masks[len] & *lw);
> REDUCE32;
> return sum;
> }
> #endif
>
> #ifdef _64BYTES_WITH_PREFETCH_INSTRUCTION
> /*
>  * Variant #5: sum the buffer 64 bytes (16 32-bit words) per main-loop
>  * iteration, issuing an explicit PREFETCH for the next cache line via
>  * __builtin_prefetch instead of a real pre-read (so no architectural
>  * register is tied up and no stall is forced). Returns the partial
>  * one's-complement sum folded by REDUCE32.
>  */
> static u_int64_t
> in_cksumdata(const void *buf, int len)
> {
> const u_int32_t *lw = (const u_int32_t *) buf;
> u_int64_t sum = 0;
> int offset;
> union q_util q_util;
>
> /* Fast path: a 4-byte-aligned 20-byte buffer (a standard IPv4 header). */
> if ((3 & (long) lw) == 0 && len == 20) {
> sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> REDUCE32;
> return sum;
> }
>
> /*
>  * Unaligned start: back lw up to the previous 4-byte boundary and use
>  * in_masks to discard the bytes that precede the buffer.
>  */
> if ((offset = 3 & (long) lw) != 0) {
> const u_int32_t *masks = in_masks + (offset << 2);
> lw = (u_int32_t *) (((long) lw) - offset);
> sum = *lw++ & masks[len >= 3 ? 3 : len];
> len -= 4 - offset;
> if (len <= 0) {
> REDUCE32;
> return sum;
> }
> }
> #if 0
> /*
> * Force to cache line boundary.
> */
> offset = 32 - (0x1f & (long) lw);
> if (offset < 32 && len > offset) {
> len -= offset;
> if (4 & offset) {
> sum += (u_int64_t) lw[0];
> lw += 1;
> }
> if (8 & offset) {
> sum += (u_int64_t) lw[0] + lw[1];
> lw += 2;
> }
> if (16 & offset) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> }
> #endif
> /*
> * Main loop: prefetch the next 64-byte group, then add the current one.
> * BUGFIX: the leading (u_int64_t) cast promotes the whole addition
> * chain to 64 bits, matching every other summing loop in this file;
> * previously these sixteen words were added in 32-bit arithmetic,
> * silently discarding carries on large values and skewing the
> * benchmark comparison against the 64-bit pre-read variants.
> */
> __builtin_prefetch(&lw[0]);
> while ((len -= 64) >= 4) {
> __builtin_prefetch(&lw[16]);
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7]
> + lw[8] + lw[9] + lw[10] + lw[11]
> + lw[12] + lw[13] + lw[14] + lw[15];
> lw += 16;
> }
> if (len >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7]
> + lw[8] + lw[9] + lw[10] + lw[11]
> + lw[12] + lw[13] + lw[14] + lw[15];
> lw += 16;
> } else {
> len += 64;
> }
> /* Tail: 16-byte chunks, then 4-byte words, then 1-3 masked bytes. */
> while ((len -= 16) >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> len += 16;
> while ((len -= 4) >= 0) {
> sum += (u_int64_t) *lw++;
> }
> len += 4;
> if (len > 0)
> sum += (u_int64_t) (in_masks[len] & *lw);
> REDUCE32;
> return sum;
> }
> #endif
>
> /* ------------------------------------------------------------------------ */
>
> /*
>  * Benchmark driver: checksum 2^20 back-to-back 1500-byte packets and
>  * print the elapsed wall-clock time as seconds.microseconds.
>  */
> int main(void)
> {
> int i;
> /* volatile: the checksum result is otherwise dead, so an optimizing
>  * build could delete the timed calls entirely; a volatile store keeps
>  * them (and fixes the u_int64_t -> int narrowing of the old 'int sum'). */
> volatile u_int64_t sum;
> struct timeval tv1, tv2, res;
>
> gettimeofday(&tv1, NULL);
> for (i = 0; i < BUFFER_SIZE; i += PACKET_SIZE)
> sum = in_cksumdata(&buffer[i], PACKET_SIZE);
> gettimeofday(&tv2, NULL);
>
> timersub(&tv2, &tv1, &res);
> /* BUGFIX: %06ld zero-pads the microsecond field; the old %6ld padded
>  * with spaces, so e.g. 98071 us printed as "0. 98071" not "0.098071". */
> printf("%ld.%06ld\n", res.tv_sec, res.tv_usec);
>
> return (0);
> }
>
More information about the freebsd-hackers
mailing list