svn commit: r280279 - head/sys/sys

Bruce Evans brde at optusnet.com.au
Mon Apr 20 14:24:46 UTC 2015


On Mon, 20 Apr 2015, Konstantin Belousov wrote:

> On Mon, Apr 13, 2015 at 04:04:45PM -0400, Jung-uk Kim wrote:
>> Please try the attached patch.
>
>> Index: sys/amd64/amd64/pmap.c
>> ===================================================================
>> --- sys/amd64/amd64/pmap.c	(revision 281496)
>> +++ sys/amd64/amd64/pmap.c	(working copy)
>> @@ -412,7 +412,7 @@ static caddr_t crashdumpmap;
>>  static void	free_pv_chunk(struct pv_chunk *pc);
>>  static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
>>  static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
>> -static int	popcnt_pc_map_elem_pq(uint64_t elem);
>> +static int	popcnt_pc_map(uint64_t *pc_map);
>>  static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
>>  static void	reserve_pv_entries(pmap_t pmap, int needed,
>>  		    struct rwlock **lockp);
>> @@ -2979,7 +2979,7 @@ retry:
>>  }
>>
>>  /*
>> - * Returns the number of one bits within the given PV chunk map element.
>> + * Returns the number of one bits within the given PV chunk map.
>>   *
>>   * The erratas for Intel processors state that "POPCNT Instruction May
>>   * Take Longer to Execute Than Expected".  It is believed that the
>> @@ -2994,12 +2994,21 @@ retry:
>>   * 5th Gen Core: BDM85
>>   */
>>  static int
>> -popcnt_pc_map_elem_pq(uint64_t elem)
>> +popcnt_pc_map(uint64_t *pc_map)
>>  {
>> -	u_long result;
>> +	u_long count, result;
>> +	int field;
>>
>> -	__asm __volatile("xorl %k0,%k0;popcntq %1,%0"
>> -	    : "=&r" (result) : "rm" (elem));
>> +	result = 0;
>> +	if ((cpu_feature2 & CPUID2_POPCNT) != 0)
>> +		for (field = 0; field < _NPCM; field++) {
>> +			__asm __volatile("xorl %k0, %k0; popcntq %1, %0"
>> +			    : "=r" (count) : "m" (pc_map[field]));
>> +			result += count;
>> +		}
>> +	else
>> +		for (field = 0; field < _NPCM; field++)
>> +			result += bitcount64(pc_map[field]);
>>  	return (result);
>>  }
>>
>> @@ -3031,15 +3040,7 @@ reserve_pv_entries(pmap_t pmap, int needed, struct
>>  retry:
>>  	avail = 0;
>>  	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
>> -		if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
>> -			free = bitcount64(pc->pc_map[0]);
>> -			free += bitcount64(pc->pc_map[1]);
>> -			free += bitcount64(pc->pc_map[2]);
>> -		} else {
>> -			free = popcnt_pc_map_elem_pq(pc->pc_map[0]);
>> -			free += popcnt_pc_map_elem_pq(pc->pc_map[1]);
>> -			free += popcnt_pc_map_elem_pq(pc->pc_map[2]);
>> -		}
>> +		free = popcnt_pc_map(pc->pc_map);
>>  		if (free == 0)
>>  			break;
>>  		avail += free;
>
> Yes, this worked for me the same way as for you, the argument is taken
> directly from memory, without temporary spill.  Is this due to silly
> inliner ?  Whatever the reason is, I think a comment should be added
> noting the subtlety.
>
> Otherwise, looks fine.

Erm, this looks silly.  It apparently works by making things too complicated
for the compiler to "optimize" (where one of the optimizations actually
gives pessimal spills).  Its main changes are:
- change the constraint from "rm" to "m".  This alone is not enough to
   get the compiler to use the natural memory operands without a trip
   through a register.  (A standalone sketch of this form follows the list.)
- un-unroll a loop.  This alone makes no difference in my simpler test
   environment.  The compiler un-un-unrolls.  (The compiler also inlines
   all the static functions whether asked to or not.  gcc shouldn't do
   this inlining for the 3 calls, and gcc -fno-inline-functions-called-once
   would never do it.  clang is missing support for -fno-inline-functions-
   called-once, and gcc48 vanished with a recent "upgrade" on freefall,
   so I couldn't test this case easily.  -fno-inline-functions-called-once
   should be the default for kernels.)
- some other changes apparently confused clang into doing the right thing.
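
For concreteness, here is a minimal standalone version of the patch's form
(rolled loop, "m" constraint), analogous to the test() below; NFIELDS and
the array y are stand-ins for _NPCM and pc->pc_map in this sketch:

X #include <sys/types.h>
X 
X #define NFIELDS 3		/* stand-in for _NPCM */
X 
X u_long y[NFIELDS];		/* stand-in for pc->pc_map */
X 
X int
X test_m(void)
X {
X 	u_long count, result;
X 	int field;
X 
X 	result = 0;
X 	for (field = 0; field < NFIELDS; field++) {
X 		/* the patch's form: "m" constraint, hard-coded xorl */
X 		__asm __volatile("xorl %k0, %k0; popcntq %1, %0"
X 		    : "=r" (count) : "m" (y[field]));
X 		result += count;
X 	}
X 	return (result);
X }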

It works better to change the constraint to "r":

X #include <sys/types.h>
X 
X static __inline u_long
X popcntq(u_long mask)
X {
X 	u_long result;
X 
X 	__asm __volatile("xorl %k0,%k0; popcntq %1,%0" :
X 	    "=&r" (result) : "r" (mask));
X 	return (result);
X }
X 
X u_long x[3];
X 
X int
X test(void)
X {
X 	return (popcntq(x[0]) + popcntq(x[1]) + popcntq(x[2]));
X }

X ...
X 	.cfi_startproc
X # BB#0:                                 # %entry
X 	pushq	%rbp
X .Ltmp0:
X 	.cfi_def_cfa_offset 16
X .Ltmp1:
X 	.cfi_offset %rbp, -16
X 	movq	%rsp, %rbp
X .Ltmp2:
X 	.cfi_def_cfa_register %rbp
X 	movq	x(%rip), %rax
X 	#APP
X 	xorl	%ecx, %ecx
X 	popcntq	%rax, %rcx
X 	#NO_APP
X 	movq	x+8(%rip), %rax
X 	#APP
X 	xorl	%edx, %edx
X 	popcntq	%rax, %rdx
X 	#NO_APP
X 	addl	%ecx, %edx
X 	movq	x+16(%rip), %rcx
X 	#APP
X 	xorl	%eax, %eax
X 	popcntq	%rcx, %rax
X 	#NO_APP
X 	addl	%edx, %eax
X                                         # kill: EAX<def> EAX<kill> RAX<kill>
X 	popq	%rbp
X 	retq

I changed and tested the generic popcntq() since this is the correct
function to use and is easier to test.  With the old "rm" constraint, the
compiler still spilled the operands to the stack instead of using them
directly.

This version does extra load instructions, but those should have a
negative or zero cost since the loads always have to be done and
explicit loads may allow better scheduling (the generated code moves them
before the xorl's, which hard-codes reasonable scheduling).

Next, improve the asm by telling the compiler to load 0:

X 	__asm __volatile("popcntq %1,%0" :
X 	    "=r" (result) : "r" (mask), "0" (0));

This avoids the ugliness with %k0 (the compiler knows the best way to
load a 64-bit 0 -- use xorl).  This generates the same code, except
the compiler uses more registers and moves one of the xorls earlier.
"rm" as a constraint still gives the spills.

Next, further improve the asm by moving the load of 0 out of it:

X 	result = 0;
X 	__asm __volatile("popcntq %1,%0" :
X 	    "+r" (result) : "r" (mask));

This doesn't change the generated code.  "rm" still gives spills
(and "m" still gives a trip through a register).

Finally for this mail, but leaving many more compiler bugs to be
determined, further improve the asm by restoring "m" to a constraint
but with "m" disparaged:

X 	result = 0;
X 	__asm __volatile("popcntq %1,%0" :
X 	    "+r,+r" (result) : "r,?m" (mask));

This works to get the register operand used here, but disparagement
seems to be broken in clang (and I never got it to do anything useful
in gcc -- I want alternatives in the main part of the asm, but that
seems to only be possible in insns in .md files).  Here clang prefers
"r" even if it is strongly disparaged relative to "m" ("!r,m").
clang seems to always prefer the first alternative.  Perhaps that is
correct since the 2 alternatives for the output constraint are the same.
No, since if the output is broken to "=m,+r", then the broken and
disparaged case is preferred.  Also, "r,+r" is a syntax error, but no
error is detected for "+r,r".

gcc42 tells me that it is "+r,+r" that is the syntax error -- the "+"
apparently must be first and then it applies to both alternatives.

Disparagement seems to work in gcc42.  E.g., with the operand naturally
living in memory, "?m,r" still prefers the memory operand, but "!m,r"
prefers the register operand.  gcc generates suboptimal loads of 0,
so the hard-coded xorl is probably best for it:

X 	result = 0;
X 	__asm __volatile("popcntq %1,%0" :
X 	    "+r,r" (result) : "r,?m" (mask));

(This is supposed to be the best version.)  gcc42 output:

X 	movl	$0, %edx

First load of 0.  Should use xorl.

X 	movl	%edx, %ecx

Second load of 0.  Should use xorl so as to not depend on the previous
load.

X #APP
X 	popcntq x,%ecx

Use second load of 0.  popcntq cannot be assembled on i386, but I only
compiled to asm.

X #NO_APP
X 	movl	%edx, %eax

Third load of 0.  Probably well enough scheduled.

X #APP
X 	popcntq x+4,%eax

Use third load of 0.  Poorly scheduled.  Should use first load of 0.

X 	popcntq x+8,%edx

Use first load of 0.

X #NO_APP
X 	addl	%ecx, %eax
X 	addl	%edx, %eax

Can probably schedule these adds better too (start the first one earlier),
but that is easy for the CPU to do.

With "m" strongly disparaged ("r,!m"):

X 	movl	$0, %edx
X 	movl	%edx, %ecx
X 	movl	x, %eax
X #APP
X 	popcntq %eax,%ecx
X #NO_APP
X 	movl	%edx, %eax
X 	movl	x+4, %ebx
X #APP
X 	popcntq %ebx,%eax

First 2 popcntq's from the non-disparaged "r".

X 	popcntq x+8,%edx

But the disparagement wasn't strong enough for the third.  OK.

X #NO_APP
X 	addl	%ecx, %eax
X 	addl	%edx, %eax

Bruce

