Fwd: Questions with the in_cksumdata() function in sys/amd64/amd64/in_cksum.c
Konstantin Belousov
kostikbel at gmail.com
Mon Oct 20 08:34:02 UTC 2014
On Mon, Oct 20, 2014 at 01:47:55PM +0800, Tiwei Bie wrote:
> > I would not be surprised if this manual prefetching by explicit reads
> > causes slowdown of the function. I suspect it could confuse hardware
> > prefetcher by breaking the linear pattern, or the patch could break
> > the logic of the limited forward-looking oracle by reading too far
> > from the current linear read tip.
> >
> > Also, it could confuse the data flow engine if the register allocator
> > is unable to see that the read value is needed not right now, and cause
> > unneeded stall while next cache line is fetched.
> >
> > Sure, all my speculations are pure garbage until confirmed by
> > measurements with pmc, but I think that the patch below must be
> > benchmarked to confirm any value it provides as well. My opinion is,
> > we should either remove the manual prefetch, or do it with PREFETCHLX
> > instructions only, instead of real read.
>
> I have done a rather simple test. And the results are listed as follows:
>
Yes, too simple to draw a conclusion, IMO.
Please look at the ministat(1). I think that the test run length
is too short to come with any decisions. The length x 3 runs does not
give enough confidence; but ministat would provide the numbers to judge.
> #1. Read 32 bytes with manual pre-read in each loop:
>
> $ cc main.c -D_32BYTES_WITH_PRE_READ
> $ for i in `seq 3`; do ./a.out; done
> 0.768854
> 0.770332
> 0.773803
>
> #2. Read 64 bytes with manual pre-read in each loop:
>
> $ cc main.c -D_64BYTES_WITH_PRE_READ
> $ for i in `seq 3`; do ./a.out; done
> 0.702416
> 0.703648
> 0.704498
>
> #3. Read 32 bytes without manual prefetch in each loop:
>
> $ cc main.c -D_32BYTES_WITHOUT_MANUAL_PREFETCH
> $ for i in `seq 3`; do ./a.out; done
> 0.726651
> 0.728841
> 0.725956
>
> #4. Read 64 bytes without manual prefetch in each loop:
>
> $ cc main.c -D_64BYTES_WITHOUT_MANUAL_PREFETCH
> $ for i in `seq 3`; do ./a.out; done
> 0.698071
> 0.697971
> 0.698314
>
> #5. Read 64 bytes with PREFETCH instruction:
>
> $ cc main.c -D_64BYTES_WITH_PREFETCH_INSTRUCTION
> $ for i in `seq 3`; do ./a.out; done
> 0.715883
> 0.712118
> 0.711962
>
> The test is very simple. I just run the in_cksumdata() function on one
> million packets. And the result is the time spent on calculating these
> checksums.
>
> As we can see from the results, reading 64 bytes of data without a manual
> prefetch operation in each loop is the fastest. So, I think reading
> a whole cache line in each loop is helpful.
>
> ---
>
> The computer that I run the test program on:
>
> $ dmesg | grep CPU:
> CPU: Intel(R) Core(TM) i5-2400 CPU @ 3.10GHz (3093.03-MHz K8-class CPU)
>
> ---
>
> The test program:
>
> #include <stdio.h>
> #include <sys/types.h>
> #include <sys/time.h>
>
> /* ------------------------------------------------------------------------ */
>
> /* 2^20 packets of 1500 bytes each: ~1.5 GB of zero-initialized BSS. */
> /* NOTE(review): buffer is never written anywhere in this program, so every
>  * checksum is computed over all-zero data; cache/TLB behavior should still
>  * be representative, but the summed values are not. */
> #define PACKET_SIZE 1500
> #define BUFFER_SIZE ((PACKET_SIZE) << 20)
> static unsigned char buffer[BUFFER_SIZE];
>
> /* ------------------------------------------------------------------------ */
>
> /*
> * Checksum routine for Internet Protocol family headers
> * (Portable Alpha version).
> *
> * This routine is very heavily used in the network
> * code and should be modified for each CPU to be as fast as possible.
> */
>
> /* Fold a carry out of the low 16 bits back into the sum (one step of
>  * one's-complement carry folding).
>  * NOTE(review): 'x' is used unparenthesized and is modified by the ?:
>  * branch -- only safe when the argument is a simple lvalue, as in the
>  * REDUCE16 use below. */
> #define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
> /* Fold the 64-bit accumulator 'sum' down to (at most) 18 significant bits
>  * by adding its four 16-bit halves through the q_util union view. */
> #define REDUCE32 \
> { \
> q_util.q = sum; \
> sum = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
> }
> /* Fold the 64-bit accumulator all the way to a 16-bit checksum value. */
> #define REDUCE16 \
> { \
> q_util.q = sum; \
> l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
> sum = l_util.s[0] + l_util.s[1]; \
> ADDCARRY(sum); \
> }
>
> /* Masks selecting the valid bytes of a 32-bit word read from a backed-up,
>  * word-aligned pointer: row = byte offset within the word, column = number
>  * of valid bytes. NOTE(review): the bit layout assumes little-endian byte
>  * order (amd64), matching the file this test was derived from. */
> static const u_int32_t in_masks[] = {
> /*0 bytes*/ /*1 byte*/ /*2 bytes*/ /*3 bytes*/
> 0x00000000, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, /* offset 0 */
> 0x00000000, 0x0000FF00, 0x00FFFF00, 0xFFFFFF00, /* offset 1 */
> 0x00000000, 0x00FF0000, 0xFFFF0000, 0xFFFF0000, /* offset 2 */
> 0x00000000, 0xFF000000, 0xFF000000, 0xFF000000, /* offset 3 */
> };
>
> /* Overlay views used by REDUCE16 to split a 32-bit value into halves. */
> union l_util {
> u_int16_t s[2];
> u_int32_t l;
> };
> /* Overlay views used by REDUCE32/REDUCE16 to split the 64-bit sum. */
> union q_util {
> u_int16_t s[4];
> u_int32_t l[2];
> u_int64_t q;
> };
>
> /* ------------------------------------------------------------------------ */
>
> //#define _32BYTES_WITH_PRE_READ
> //#define _64BYTES_WITH_PRE_READ
> //#define _32BYTES_WITHOUT_MANUAL_PREFETCH
> //#define _64BYTES_WITHOUT_MANUAL_PREFETCH
> //#define _64BYTES_WITH_PREFETCH_INSTRUCTION
>
> #ifdef _32BYTES_WITH_PRE_READ
> /*
>  * Variant #1: sum the buffer 32 bytes (8 32-bit words) per main-loop
>  * iteration, manually pre-reading the first word of the NEXT 32-byte
>  * group into 'prefilled' so its cache line starts loading while the
>  * current group is being added. Returns the partial one's-complement
>  * sum folded to <= 18 bits by REDUCE32.
>  */
> static u_int64_t
> in_cksumdata(const void *buf, int len)
> {
> const u_int32_t *lw = (const u_int32_t *) buf;
> u_int64_t sum = 0;
> u_int64_t prefilled;
> int offset;
> union q_util q_util;
>
> /* Fast path: a 4-byte-aligned 20-byte buffer (a standard IPv4 header). */
> if ((3 & (long) lw) == 0 && len == 20) {
> sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> REDUCE32;
> return sum;
> }
>
> /*
>  * Unaligned start: back lw up to the previous 4-byte boundary and use
>  * in_masks (row = offset within the word, column = min(len, 3) valid
>  * bytes) to discard the bytes that precede the buffer.
>  */
> if ((offset = 3 & (long) lw) != 0) {
> const u_int32_t *masks = in_masks + (offset << 2);
> lw = (u_int32_t *) (((long) lw) - offset);
> sum = *lw++ & masks[len >= 3 ? 3 : len];
> len -= 4 - offset;
> if (len <= 0) {
> REDUCE32;
> return sum;
> }
> }
> #if 0
> /*
> * Force to cache line boundary.
> */
> offset = 32 - (0x1f & (long) lw);
> if (offset < 32 && len > offset) {
> len -= offset;
> if (4 & offset) {
> sum += (u_int64_t) lw[0];
> lw += 1;
> }
> if (8 & offset) {
> sum += (u_int64_t) lw[0] + lw[1];
> lw += 2;
> }
> if (16 & offset) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> }
> #endif
> /*
> * access prefilling to start load of next cache line.
> * then add current cache line
> * save result of prefilling for loop iteration.
> */
> /* The ">= 4" guard ensures at least 4 bytes remain beyond the current
>  * group, so the lw[8] pre-read below never runs past the buffer. The
>  * 64-bit 'prefilled' also promotes the whole addition chain to 64 bits. */
> prefilled = lw[0];
> while ((len -= 32) >= 4) {
> u_int64_t prefilling = lw[8];
> sum += prefilled + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7];
> lw += 8;
> prefilled = prefilling;
> }
> /* len >= 0 here means one full 32-byte group remains and 'prefilled'
>  * already holds its first word; otherwise undo the last decrement. */
> if (len >= 0) {
> sum += prefilled + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7];
> lw += 8;
> } else {
> len += 32;
> }
> /* Tail: one 16-byte chunk, then 4-byte words, then 1-3 masked bytes. */
> while ((len -= 16) >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> len += 16;
> while ((len -= 4) >= 0) {
> sum += (u_int64_t) *lw++;
> }
> len += 4;
> if (len > 0)
> sum += (u_int64_t) (in_masks[len] & *lw);
> REDUCE32;
> return sum;
> }
> #endif
>
> #ifdef _64BYTES_WITH_PRE_READ
> /*
>  * Variant #2: sum the buffer 64 bytes (16 32-bit words, one full cache
>  * line on this CPU) per main-loop iteration, manually pre-reading the
>  * first word of the NEXT 64-byte group into 'prefilled' so its cache
>  * line starts loading while the current group is being added. Returns
>  * the partial one's-complement sum folded by REDUCE32.
>  */
> static u_int64_t
> in_cksumdata(const void *buf, int len)
> {
> const u_int32_t *lw = (const u_int32_t *) buf;
> u_int64_t sum = 0;
> u_int64_t prefilled;
> int offset;
> union q_util q_util;
>
> /* Fast path: a 4-byte-aligned 20-byte buffer (a standard IPv4 header). */
> if ((3 & (long) lw) == 0 && len == 20) {
> sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> REDUCE32;
> return sum;
> }
>
> /*
>  * Unaligned start: back lw up to the previous 4-byte boundary and use
>  * in_masks to discard the bytes that precede the buffer.
>  */
> if ((offset = 3 & (long) lw) != 0) {
> const u_int32_t *masks = in_masks + (offset << 2);
> lw = (u_int32_t *) (((long) lw) - offset);
> sum = *lw++ & masks[len >= 3 ? 3 : len];
> len -= 4 - offset;
> if (len <= 0) {
> REDUCE32;
> return sum;
> }
> }
> #if 0
> /*
> * Force to cache line boundary.
> */
> offset = 32 - (0x1f & (long) lw);
> if (offset < 32 && len > offset) {
> len -= offset;
> if (4 & offset) {
> sum += (u_int64_t) lw[0];
> lw += 1;
> }
> if (8 & offset) {
> sum += (u_int64_t) lw[0] + lw[1];
> lw += 2;
> }
> if (16 & offset) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> }
> #endif
> /*
> * access prefilling to start load of next cache line.
> * then add current cache line
> * save result of prefilling for loop iteration.
> */
> /* The ">= 4" guard ensures at least 4 bytes remain beyond the current
>  * group, so the lw[16] pre-read below never runs past the buffer. The
>  * 64-bit 'prefilled' also promotes the whole addition chain to 64 bits. */
> prefilled = lw[0];
> while ((len -= 64) >= 4) {
> u_int64_t prefilling = lw[16];
> sum += prefilled + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7]
> + lw[8] + lw[9] + lw[10] + lw[11]
> + lw[12] + lw[13] + lw[14] + lw[15];
> lw += 16;
> prefilled = prefilling;
> }
> /* len >= 0 here means one full 64-byte group remains and 'prefilled'
>  * already holds its first word; otherwise undo the last decrement. */
> if (len >= 0) {
> sum += prefilled + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7]
> + lw[8] + lw[9] + lw[10] + lw[11]
> + lw[12] + lw[13] + lw[14] + lw[15];
> lw += 16;
> } else {
> len += 64;
> }
> /* Tail: 16-byte chunks, then 4-byte words, then 1-3 masked bytes. */
> while ((len -= 16) >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> len += 16;
> while ((len -= 4) >= 0) {
> sum += (u_int64_t) *lw++;
> }
> len += 4;
> if (len > 0)
> sum += (u_int64_t) (in_masks[len] & *lw);
> REDUCE32;
> return sum;
> }
> #endif
>
> #ifdef _32BYTES_WITHOUT_MANUAL_PREFETCH
> /*
>  * Variant #3: sum the buffer 32 bytes (8 32-bit words) per main-loop
>  * iteration with no manual prefetch -- the hardware prefetcher is
>  * trusted to follow the linear read pattern. Returns the partial
>  * one's-complement sum folded by REDUCE32.
>  */
> static u_int64_t
> in_cksumdata(const void *buf, int len)
> {
> const u_int32_t *lw = (const u_int32_t *) buf;
> u_int64_t sum = 0;
> int offset;
> union q_util q_util;
>
> /* Fast path: a 4-byte-aligned 20-byte buffer (a standard IPv4 header). */
> if ((3 & (long) lw) == 0 && len == 20) {
> sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> REDUCE32;
> return sum;
> }
>
> /*
>  * Unaligned start: back lw up to the previous 4-byte boundary and use
>  * in_masks to discard the bytes that precede the buffer.
>  */
> if ((offset = 3 & (long) lw) != 0) {
> const u_int32_t *masks = in_masks + (offset << 2);
> lw = (u_int32_t *) (((long) lw) - offset);
> sum = *lw++ & masks[len >= 3 ? 3 : len];
> len -= 4 - offset;
> if (len <= 0) {
> REDUCE32;
> return sum;
> }
> }
> #if 0
> /*
> * Force to cache line boundary.
> */
> offset = 32 - (0x1f & (long) lw);
> if (offset < 32 && len > offset) {
> len -= offset;
> if (4 & offset) {
> sum += (u_int64_t) lw[0];
> lw += 1;
> }
> if (8 & offset) {
> sum += (u_int64_t) lw[0] + lw[1];
> lw += 2;
> }
> if (16 & offset) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> }
> #endif
> /*
> * Main loop: 32 bytes per iteration. BUGFIX: the leading (u_int64_t)
> * cast promotes the whole addition chain to 64 bits, matching every
> * other summing loop in this file; previously these eight words were
> * added in 32-bit arithmetic, silently discarding carries on large
> * values -- and doing cheaper 32-bit adds than the pre-read variants,
> * which skewed the benchmark comparison.
> */
> while ((len -= 32) >= 4) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7];
> lw += 8;
> }
> if (len >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7];
> lw += 8;
> } else {
> len += 32;
> }
> /* Tail: one 16-byte chunk, then 4-byte words, then 1-3 masked bytes. */
> while ((len -= 16) >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> len += 16;
> while ((len -= 4) >= 0) {
> sum += (u_int64_t) *lw++;
> }
> len += 4;
> if (len > 0)
> sum += (u_int64_t) (in_masks[len] & *lw);
> REDUCE32;
> return sum;
> }
> #endif
>
> #ifdef _64BYTES_WITHOUT_MANUAL_PREFETCH
> /*
>  * Variant #4: sum the buffer 64 bytes (16 32-bit words, a full cache
>  * line) per main-loop iteration with no manual prefetch -- the hardware
>  * prefetcher is trusted to follow the linear read pattern. Returns the
>  * partial one's-complement sum folded by REDUCE32.
>  */
> static u_int64_t
> in_cksumdata(const void *buf, int len)
> {
> const u_int32_t *lw = (const u_int32_t *) buf;
> u_int64_t sum = 0;
> int offset;
> union q_util q_util;
>
> /* Fast path: a 4-byte-aligned 20-byte buffer (a standard IPv4 header). */
> if ((3 & (long) lw) == 0 && len == 20) {
> sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> REDUCE32;
> return sum;
> }
>
> /*
>  * Unaligned start: back lw up to the previous 4-byte boundary and use
>  * in_masks to discard the bytes that precede the buffer.
>  */
> if ((offset = 3 & (long) lw) != 0) {
> const u_int32_t *masks = in_masks + (offset << 2);
> lw = (u_int32_t *) (((long) lw) - offset);
> sum = *lw++ & masks[len >= 3 ? 3 : len];
> len -= 4 - offset;
> if (len <= 0) {
> REDUCE32;
> return sum;
> }
> }
> #if 0
> /*
> * Force to cache line boundary.
> */
> offset = 32 - (0x1f & (long) lw);
> if (offset < 32 && len > offset) {
> len -= offset;
> if (4 & offset) {
> sum += (u_int64_t) lw[0];
> lw += 1;
> }
> if (8 & offset) {
> sum += (u_int64_t) lw[0] + lw[1];
> lw += 2;
> }
> if (16 & offset) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> }
> #endif
> /*
> * Main loop: 64 bytes per iteration. BUGFIX: the leading (u_int64_t)
> * cast promotes the whole addition chain to 64 bits, matching every
> * other summing loop in this file; previously these sixteen words were
> * added in 32-bit arithmetic, silently discarding carries on large
> * values -- and doing cheaper 32-bit adds than the pre-read variants,
> * which skewed the benchmark comparison.
> */
> while ((len -= 64) >= 4) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7]
> + lw[8] + lw[9] + lw[10] + lw[11]
> + lw[12] + lw[13] + lw[14] + lw[15];
> lw += 16;
> }
> if (len >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7]
> + lw[8] + lw[9] + lw[10] + lw[11]
> + lw[12] + lw[13] + lw[14] + lw[15];
> lw += 16;
> } else {
> len += 64;
> }
> /* Tail: 16-byte chunks, then 4-byte words, then 1-3 masked bytes. */
> while ((len -= 16) >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> len += 16;
> while ((len -= 4) >= 0) {
> sum += (u_int64_t) *lw++;
> }
> len += 4;
> if (len > 0)
> sum += (u_int64_t) (in_masks[len] & *lw);
> REDUCE32;
> return sum;
> }
> #endif
>
> #ifdef _64BYTES_WITH_PREFETCH_INSTRUCTION
> /*
>  * Variant #5: sum the buffer 64 bytes (16 32-bit words) per main-loop
>  * iteration, issuing an explicit PREFETCH for the next cache line via
>  * __builtin_prefetch instead of a real pre-read (so no architectural
>  * register is tied up and no stall is forced). Returns the partial
>  * one's-complement sum folded by REDUCE32.
>  */
> static u_int64_t
> in_cksumdata(const void *buf, int len)
> {
> const u_int32_t *lw = (const u_int32_t *) buf;
> u_int64_t sum = 0;
> int offset;
> union q_util q_util;
>
> /* Fast path: a 4-byte-aligned 20-byte buffer (a standard IPv4 header). */
> if ((3 & (long) lw) == 0 && len == 20) {
> sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> REDUCE32;
> return sum;
> }
>
> /*
>  * Unaligned start: back lw up to the previous 4-byte boundary and use
>  * in_masks to discard the bytes that precede the buffer.
>  */
> if ((offset = 3 & (long) lw) != 0) {
> const u_int32_t *masks = in_masks + (offset << 2);
> lw = (u_int32_t *) (((long) lw) - offset);
> sum = *lw++ & masks[len >= 3 ? 3 : len];
> len -= 4 - offset;
> if (len <= 0) {
> REDUCE32;
> return sum;
> }
> }
> #if 0
> /*
> * Force to cache line boundary.
> */
> offset = 32 - (0x1f & (long) lw);
> if (offset < 32 && len > offset) {
> len -= offset;
> if (4 & offset) {
> sum += (u_int64_t) lw[0];
> lw += 1;
> }
> if (8 & offset) {
> sum += (u_int64_t) lw[0] + lw[1];
> lw += 2;
> }
> if (16 & offset) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> }
> #endif
> /*
> * Main loop: prefetch the next 64-byte group, then add the current one.
> * BUGFIX: the leading (u_int64_t) cast promotes the whole addition
> * chain to 64 bits, matching every other summing loop in this file;
> * previously these sixteen words were added in 32-bit arithmetic,
> * silently discarding carries on large values and skewing the
> * benchmark comparison against the 64-bit pre-read variants.
> */
> __builtin_prefetch(&lw[0]);
> while ((len -= 64) >= 4) {
> __builtin_prefetch(&lw[16]);
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7]
> + lw[8] + lw[9] + lw[10] + lw[11]
> + lw[12] + lw[13] + lw[14] + lw[15];
> lw += 16;
> }
> if (len >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]
> + lw[4] + lw[5] + lw[6] + lw[7]
> + lw[8] + lw[9] + lw[10] + lw[11]
> + lw[12] + lw[13] + lw[14] + lw[15];
> lw += 16;
> } else {
> len += 64;
> }
> /* Tail: 16-byte chunks, then 4-byte words, then 1-3 masked bytes. */
> while ((len -= 16) >= 0) {
> sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> lw += 4;
> }
> len += 16;
> while ((len -= 4) >= 0) {
> sum += (u_int64_t) *lw++;
> }
> len += 4;
> if (len > 0)
> sum += (u_int64_t) (in_masks[len] & *lw);
> REDUCE32;
> return sum;
> }
> #endif
>
> /* ------------------------------------------------------------------------ */
>
> /*
>  * Benchmark driver: checksum 2^20 back-to-back 1500-byte packets and
>  * print the elapsed wall-clock time as seconds.microseconds.
>  */
> int main(void)
> {
> int i;
> /* volatile: the checksum result is otherwise dead, so an optimizing
>  * build could delete the timed calls entirely; a volatile store keeps
>  * them (and fixes the u_int64_t -> int narrowing of the old 'int sum'). */
> volatile u_int64_t sum;
> struct timeval tv1, tv2, res;
>
> gettimeofday(&tv1, NULL);
> for (i = 0; i < BUFFER_SIZE; i += PACKET_SIZE)
> sum = in_cksumdata(&buffer[i], PACKET_SIZE);
> gettimeofday(&tv2, NULL);
>
> timersub(&tv2, &tv1, &res);
> /* BUGFIX: %06ld zero-pads the microsecond field; the old %6ld padded
>  * with spaces, so e.g. 98071 us printed as "0. 98071" not "0.098071". */
> printf("%ld.%06ld\n", res.tv_sec, res.tv_usec);
>
> return (0);
> }
>
More information about the freebsd-hackers
mailing list