Fwd: Questions with the in_cksumdata() function in sys/amd64/amd64/in_cksum.c

Tiwei Bie btw at mail.ustc.edu.cn
Mon Oct 20 05:48:18 UTC 2014


> I would not be surprised if this manual prefetching by explicit reads
> causes a slowdown of the function.  I suspect it could confuse the
> hardware prefetcher by breaking the linear pattern, or the patch could
> break the logic of the limited forward-looking oracle by reading too far
> ahead of the current linear read tip.
> 
> Also, it could confuse the data-flow engine if the register allocator
> is unable to see that the read value is not needed right away, causing
> an unneeded stall while the next cache line is fetched.
> 
> Sure, all my speculations are pure garbage until confirmed by
> measurements with pmc, but I think the patch below must be benchmarked
> to confirm any value it provides as well.  My opinion is that we should
> either remove the manual prefetch, or do it with PREFETCHLX
> instructions only, instead of a real read.

I have done a rather simple test, and the results are listed below:

#1. Read 32 bytes with manual pre-read in each loop:

$ cc main.c -D_32BYTES_WITH_PRE_READ
$ for i in `seq 3`; do ./a.out; done
0.768854
0.770332
0.773803

#2. Read 64 bytes with manual pre-read in each loop:

$ cc main.c -D_64BYTES_WITH_PRE_READ
$ for i in `seq 3`; do ./a.out; done
0.702416
0.703648
0.704498

#3. Read 32 bytes without manual prefetch in each loop:

$ cc main.c -D_32BYTES_WITHOUT_MANUAL_PREFETCH
$ for i in `seq 3`; do ./a.out; done
0.726651
0.728841
0.725956

#4. Read 64 bytes without manual prefetch in each loop:

$ cc main.c -D_64BYTES_WITHOUT_MANUAL_PREFETCH
$ for i in `seq 3`; do ./a.out; done
0.698071
0.697971
0.698314

#5. Read 64 bytes with PREFETCH instruction:

$ cc main.c -D_64BYTES_WITH_PREFETCH_INSTRUCTION
$ for i in `seq 3`; do ./a.out; done
0.715883
0.712118
0.711962

The test is very simple: I run the in_cksumdata() function on roughly one
million packets (BUFFER_SIZE / PACKET_SIZE = 2^20 = 1,048,576 packets of
1500 bytes each), and the result is the time spent calculating these
checksums.

As the results show, reading 64 bytes per loop iteration without a manual
prefetch operation is the fastest variant. So I think reading a whole
cache line in each loop iteration is helpful.
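
By the way, the PREFETCHLX-style hint used in variant #5 is generated with
GCC's __builtin_prefetch() builtin. As a reference, here is a minimal
sketch (my own illustration, not part of the benchmarked code) of how the
builtin's locality argument selects the prefetch instruction on amd64:

/*
 * Sketch only: __builtin_prefetch(addr, rw, locality) issues a prefetch
 * hint instead of a real load.  With rw = 0 (read), locality = 3 maps
 * to PREFETCHT0 (keep the line in all cache levels) and locality = 0
 * maps to PREFETCHNTA (non-temporal).
 */
static inline void
prefetch_next_line(const void *p)
{
	__builtin_prefetch(p, 0, 3);	/* PREFETCHT0 */
}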

---

The computer that I run the test program on:

$ dmesg | grep CPU:
CPU: Intel(R) Core(TM) i5-2400 CPU @ 3.10GHz (3093.03-MHz K8-class CPU)

---

The test program:

#include <stdio.h>
#include <sys/types.h>
#include <sys/time.h>

/* ------------------------------------------------------------------------ */

#define PACKET_SIZE 1500
#define BUFFER_SIZE ((PACKET_SIZE) << 20)
static unsigned char buffer[BUFFER_SIZE];

/* ------------------------------------------------------------------------ */

/*
 * Checksum routine for Internet Protocol family headers
 *    (Portable Alpha version).
 *
 * This routine is very heavily used in the network
 * code and should be modified for each CPU to be as fast as possible.
 */
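
/*
 * The checksum is accumulated in 64 bits: REDUCE32 folds the 64-bit
 * accumulator into (at most) 32 bits by summing its four 16-bit lanes;
 * REDUCE16 folds it all the way down to 16 bits and handles the final
 * ones'-complement carry with ADDCARRY.
 */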

#define ADDCARRY(x)  (x > 65535 ? x -= 65535 : x)
#define REDUCE32							  \
    {									  \
	q_util.q = sum;							  \
	sum = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3];	  \
    }
#define REDUCE16							  \
    {									  \
	q_util.q = sum;							  \
	l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
	sum = l_util.s[0] + l_util.s[1];				  \
	ADDCARRY(sum);							  \
    }
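
/*
 * Byte-select masks, indexed as in_masks[(offset << 2) + n]: keep the
 * (up to n) bytes of an aligned little-endian 32-bit word that belong
 * to a buffer whose edge falls at byte `offset' within that word.
 */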

static const u_int32_t in_masks[] = {
	/*0 bytes*/ /*1 byte*/	/*2 bytes*/ /*3 bytes*/
	0x00000000, 0x000000FF, 0x0000FFFF, 0x00FFFFFF,	/* offset 0 */
	0x00000000, 0x0000FF00, 0x00FFFF00, 0xFFFFFF00,	/* offset 1 */
	0x00000000, 0x00FF0000, 0xFFFF0000, 0xFFFF0000,	/* offset 2 */
	0x00000000, 0xFF000000, 0xFF000000, 0xFF000000,	/* offset 3 */
};

union l_util {
	u_int16_t s[2];
	u_int32_t l;
};
union q_util {
	u_int16_t s[4];
	u_int32_t l[2];
	u_int64_t q;
};

/* ------------------------------------------------------------------------ */

//#define _32BYTES_WITH_PRE_READ
//#define _64BYTES_WITH_PRE_READ
//#define _32BYTES_WITHOUT_MANUAL_PREFETCH
//#define _64BYTES_WITHOUT_MANUAL_PREFETCH
//#define _64BYTES_WITH_PREFETCH_INSTRUCTION

#ifdef _32BYTES_WITH_PRE_READ
static u_int64_t
in_cksumdata(const void *buf, int len)
{
	const u_int32_t *lw = (const u_int32_t *) buf;
	u_int64_t sum = 0;
	u_int64_t prefilled;
	int offset;
	union q_util q_util;

	if ((3 & (long) lw) == 0 && len == 20) {
	     sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
	     REDUCE32;
	     return sum;
	}

	/*
	 * Unaligned start: back the pointer up to the enclosing aligned
	 * word and mask off the bytes that precede the buffer.
	 */
	if ((offset = 3 & (long) lw) != 0) {
		const u_int32_t *masks = in_masks + (offset << 2);
		lw = (u_int32_t *) (((long) lw) - offset);
		sum = *lw++ & masks[len >= 3 ? 3 : len];
		len -= 4 - offset;
		if (len <= 0) {
			REDUCE32;
			return sum;
		}
	}
#if 0
	/*
	 * Force to cache line boundary.
	 */
	offset = 32 - (0x1f & (long) lw);
	if (offset < 32 && len > offset) {
		len -= offset;
		if (4 & offset) {
			sum += (u_int64_t) lw[0];
			lw += 1;
		}
		if (8 & offset) {
			sum += (u_int64_t) lw[0] + lw[1];
			lw += 2;
		}
		if (16 & offset) {
			sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
			lw += 4;
		}
	}
#endif
	/*
	 * Pre-read the first word of the next cache line (lw[8]) so that
	 * its load starts while the current line is being summed, then
	 * carry the pre-read value into the next iteration.
	 */
	prefilled = lw[0];
	while ((len -= 32) >= 4) {
		u_int64_t prefilling = lw[8];
		sum += prefilled + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7];
		lw += 8;
		prefilled = prefilling;
	}
	if (len >= 0) {
		sum += prefilled + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7];
		lw += 8;
	} else {
		len += 32;
	}
	while ((len -= 16) >= 0) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
		lw += 4;
	}
	len += 16;
	while ((len -= 4) >= 0) {
		sum += (u_int64_t) *lw++;
	}
	len += 4;
	if (len > 0)
		sum += (u_int64_t) (in_masks[len] & *lw);
	REDUCE32;
	return sum;
}
#endif

#ifdef _64BYTES_WITH_PRE_READ
static u_int64_t
in_cksumdata(const void *buf, int len)
{
	const u_int32_t *lw = (const u_int32_t *) buf;
	u_int64_t sum = 0;
	u_int64_t prefilled;
	int offset;
	union q_util q_util;

	if ((3 & (long) lw) == 0 && len == 20) {
	     sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
	     REDUCE32;
	     return sum;
	}

	if ((offset = 3 & (long) lw) != 0) {
		const u_int32_t *masks = in_masks + (offset << 2);
		lw = (u_int32_t *) (((long) lw) - offset);
		sum = *lw++ & masks[len >= 3 ? 3 : len];
		len -= 4 - offset;
		if (len <= 0) {
			REDUCE32;
			return sum;
		}
	}
#if 0
	/*
	 * Force to cache line boundary.
	 */
	offset = 32 - (0x1f & (long) lw);
	if (offset < 32 && len > offset) {
		len -= offset;
		if (4 & offset) {
			sum += (u_int64_t) lw[0];
			lw += 1;
		}
		if (8 & offset) {
			sum += (u_int64_t) lw[0] + lw[1];
			lw += 2;
		}
		if (16 & offset) {
			sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
			lw += 4;
		}
	}
#endif
	/*
	 * Pre-read the first word of the next cache line (lw[16]) so that
	 * its load starts while the current line is being summed, then
	 * carry the pre-read value into the next iteration.
	 */
	prefilled = lw[0];
	while ((len -= 64) >= 4) {
		u_int64_t prefilling = lw[16];
		sum += prefilled + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7]
			+ lw[8] + lw[9] + lw[10] + lw[11]
			+ lw[12] + lw[13] + lw[14] + lw[15];
		lw += 16;
		prefilled = prefilling;
	}
	if (len >= 0) {
		sum += prefilled + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7]
			+ lw[8] + lw[9] + lw[10] + lw[11]
			+ lw[12] + lw[13] + lw[14] + lw[15];
		lw += 16;
	} else {
		len += 64;
	}
	while ((len -= 16) >= 0) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
		lw += 4;
	}
	len += 16;
	while ((len -= 4) >= 0) {
		sum += (u_int64_t) *lw++;
	}
	len += 4;
	if (len > 0)
		sum += (u_int64_t) (in_masks[len] & *lw);
	REDUCE32;
	return sum;
}
#endif

#ifdef _32BYTES_WITHOUT_MANUAL_PREFETCH
static u_int64_t
in_cksumdata(const void *buf, int len)
{
	const u_int32_t *lw = (const u_int32_t *) buf;
	u_int64_t sum = 0;
	int offset;
	union q_util q_util;

	if ((3 & (long) lw) == 0 && len == 20) {
	     sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
	     REDUCE32;
	     return sum;
	}

	if ((offset = 3 & (long) lw) != 0) {
		const u_int32_t *masks = in_masks + (offset << 2);
		lw = (u_int32_t *) (((long) lw) - offset);
		sum = *lw++ & masks[len >= 3 ? 3 : len];
		len -= 4 - offset;
		if (len <= 0) {
			REDUCE32;
			return sum;
		}
	}
#if 0
	/*
	 * Force to cache line boundary.
	 */
	offset = 32 - (0x1f & (long) lw);
	if (offset < 32 && len > offset) {
		len -= offset;
		if (4 & offset) {
			sum += (u_int64_t) lw[0];
			lw += 1;
		}
		if (8 & offset) {
			sum += (u_int64_t) lw[0] + lw[1];
			lw += 2;
		}
		if (16 & offset) {
			sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
			lw += 4;
		}
	}
#endif
	/*
	 * No manual prefetch: just sum 32 bytes per iteration and rely
	 * on the hardware prefetcher.
	 */
	while ((len -= 32) >= 4) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7];
		lw += 8;
	}
	if (len >= 0) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7];
		lw += 8;
	} else {
		len += 32;
	}
	while ((len -= 16) >= 0) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
		lw += 4;
	}
	len += 16;
	while ((len -= 4) >= 0) {
		sum += (u_int64_t) *lw++;
	}
	len += 4;
	if (len > 0)
		sum += (u_int64_t) (in_masks[len] & *lw);
	REDUCE32;
	return sum;
}
#endif

#ifdef _64BYTES_WITHOUT_MANUAL_PREFETCH
static u_int64_t
in_cksumdata(const void *buf, int len)
{
	const u_int32_t *lw = (const u_int32_t *) buf;
	u_int64_t sum = 0;
	int offset;
	union q_util q_util;

	if ((3 & (long) lw) == 0 && len == 20) {
	     sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
	     REDUCE32;
	     return sum;
	}

	if ((offset = 3 & (long) lw) != 0) {
		const u_int32_t *masks = in_masks + (offset << 2);
		lw = (u_int32_t *) (((long) lw) - offset);
		sum = *lw++ & masks[len >= 3 ? 3 : len];
		len -= 4 - offset;
		if (len <= 0) {
			REDUCE32;
			return sum;
		}
	}
#if 0
	/*
	 * Force to cache line boundary.
	 */
	offset = 32 - (0x1f & (long) lw);
	if (offset < 32 && len > offset) {
		len -= offset;
		if (4 & offset) {
			sum += (u_int64_t) lw[0];
			lw += 1;
		}
		if (8 & offset) {
			sum += (u_int64_t) lw[0] + lw[1];
			lw += 2;
		}
		if (16 & offset) {
			sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
			lw += 4;
		}
	}
#endif
	/*
	 * No manual prefetch: sum a whole 64-byte cache line per
	 * iteration and rely on the hardware prefetcher.
	 */
	while ((len -= 64) >= 4) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7]
			+ lw[8] + lw[9] + lw[10] + lw[11]
			+ lw[12] + lw[13] + lw[14] + lw[15];
		lw += 16;
	}
	if (len >= 0) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7]
			+ lw[8] + lw[9] + lw[10] + lw[11]
			+ lw[12] + lw[13] + lw[14] + lw[15];
		lw += 16;
	} else {
		len += 64;
	}
	while ((len -= 16) >= 0) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
		lw += 4;
	}
	len += 16;
	while ((len -= 4) >= 0) {
		sum += (u_int64_t) *lw++;
	}
	len += 4;
	if (len > 0)
		sum += (u_int64_t) (in_masks[len] & *lw);
	REDUCE32;
	return sum;
}
#endif

#ifdef _64BYTES_WITH_PREFETCH_INSTRUCTION
static u_int64_t
in_cksumdata(const void *buf, int len)
{
	const u_int32_t *lw = (const u_int32_t *) buf;
	u_int64_t sum = 0;
	int offset;
	union q_util q_util;

	if ((3 & (long) lw) == 0 && len == 20) {
	     sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
	     REDUCE32;
	     return sum;
	}

	if ((offset = 3 & (long) lw) != 0) {
		const u_int32_t *masks = in_masks + (offset << 2);
		lw = (u_int32_t *) (((long) lw) - offset);
		sum = *lw++ & masks[len >= 3 ? 3 : len];
		len -= 4 - offset;
		if (len <= 0) {
			REDUCE32;
			return sum;
		}
	}
#if 0
	/*
	 * Force to cache line boundary.
	 */
	offset = 32 - (0x1f & (long) lw);
	if (offset < 32 && len > offset) {
		len -= offset;
		if (4 & offset) {
			sum += (u_int64_t) lw[0];
			lw += 1;
		}
		if (8 & offset) {
			sum += (u_int64_t) lw[0] + lw[1];
			lw += 2;
		}
		if (16 & offset) {
			sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
			lw += 4;
		}
	}
#endif
	/*
	 * Issue a prefetch hint for the next cache line instead of
	 * doing an explicit read.
	 */
	__builtin_prefetch(&lw[0]);
	while ((len -= 64) >= 4) {
		__builtin_prefetch(&lw[16]);
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7]
			+ lw[8] + lw[9] + lw[10] + lw[11]
			+ lw[12] + lw[13] + lw[14] + lw[15];
		lw += 16;
	}
	if (len >= 0) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7]
			+ lw[8] + lw[9] + lw[10] + lw[11]
			+ lw[12] + lw[13] + lw[14] + lw[15];
		lw += 16;
	} else {
		len += 64;
	}
	while ((len -= 16) >= 0) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
		lw += 4;
	}
	len += 16;
	while ((len -= 4) >= 0) {
		sum += (u_int64_t) *lw++;
	}
	len += 4;
	if (len > 0)
		sum += (u_int64_t) (in_masks[len] & *lw);
	REDUCE32;
	return sum;
}
#endif

/* ------------------------------------------------------------------------ */

int main(void)
{
	int i;
	volatile u_int64_t sum;	/* volatile: keep the calls live under -O */
	struct timeval tv1, tv2, res;

	gettimeofday(&tv1, NULL);
	for (i = 0; i < BUFFER_SIZE; i += PACKET_SIZE)
		sum = in_cksumdata(&buffer[i], PACKET_SIZE);
	gettimeofday(&tv2, NULL);

	timersub(&tv2, &tv1, &res);
	printf("%ld.%06ld\n", res.tv_sec, res.tv_usec);

	return (0);
}



