[CFR] mge driver / elf reloc

Tue Jul 22 07:16:27 UTC 2014

On Mon, 21 Jul 2014, Mark Linimon wrote:

> On Tue, Jul 22, 2014 at 03:53:10AM +1000, Bruce Evans wrote:
>> This is with gcc.  clang doesn't work on ia64 and/or pluto.
>
> Since Marcel has dropped support for ia64, and in fact removed ia64-
> specific code from -HEAD, I'm not sure how much good this analysis
> will accomplish :-)

ia64/pluto is just an example of an arch with strict alignment
requirements.  clang is broken for it so I could only test with
gcc.

This analysis applies to all non-x86 arches in FreeBSD cluster
machines, since there aren't many others and the only other one
(sparc64/flame) also has strict alignment requirements.  clang
is broken on it too, so I could only test with gcc:

% #include <sys/endian.h>
% 
% struct foo {
% 	int	x;
% } __packed;
% 
% struct foo x;
% 
% static __inline uint32_t
% xle32dec(const void *_p)
% {
% 	uint32_t _t;
% 
% 	__builtin_memcpy(&_t, _p, sizeof(_t));
% 	return (_t);
% }
% 
% static __inline void
% xle32enc(void *_p, uint32_t _u)
% {
% 
% 	__builtin_memcpy(_p, &_u, sizeof(_u));
% }
% 
% uint32_t
% q(void)
% {
% 	return xle32dec(&x);
% }
% 
% void
% r(void)
% {
% 	return xle32enc(&x, 1);
% }

This tests the memcpy versions.

__packed gives the expected mess:

% 	.file	"z.c"
% 	.section	".text"
% 	.align 4
% 	.align 32
% 	.global r
% 	.type	r, #function
% 	.proc	020
% r:
% 	.register	%g2, #scratch
% 	.register	%g3, #scratch
% 	add	%sp, -208, %sp
% 	mov	1, %g1
% 	st	%g1, [%sp+2235]
% 	sethi	%hi(x), %g2
% 	or	%g2, %lo(x), %g3
% 	ldub	[%sp+2235], %g1
% 	stb	%g1, [%g2+%lo(x)]
% 	ldub	[%sp+2236], %g1
% 	stb	%g1, [%g3+1]
% 	ldub	[%sp+2237], %g1
% 	stb	%g1, [%g3+2]
% 	ldub	[%sp+2238], %g1
% 	stb	%g1, [%g3+3]
% 	jmp	%o7+8
% 	 sub	%sp, -208, %sp
% 	.size	r, .-r
% 	.align 4
% 	.align 32
% 	.global q
% 	.type	q, #function
% 	.proc	016
% q:
% 	add	%sp, -208, %sp
% 	sethi	%hi(x), %g1
% 	or	%g1, %lo(x), %g2
% 	ldub	[%g1+%lo(x)], %g1
% 	stb	%g1, [%sp+2235]
% 	ldub	[%g2+1], %g1
% 	stb	%g1, [%sp+2236]
% 	ldub	[%g2+2], %g1
% 	stb	%g1, [%sp+2237]
% 	ldub	[%g2+3], %g1
% 	stb	%g1, [%sp+2238]
% 	lduw	[%sp+2235], %o0
% 	jmp	%o7+8
% 	 sub	%sp, -208, %sp
% 	.size	q, .-q
% 	.common	x,4,1
% 	.ident	"GCC: (GNU) 4.2.1 20070831 patched [FreeBSD]"

I think both functions copy the memory bytewise (4+4 memory references)
and do 1 load of the final copy or 1 store to the temporary copy.  So
the memcpy is not virtual, and the memcpy versions might be worse than
the -current versions which should use 4+1 memory references plus lots
of shifts and masks on registers.  Register operations are faster but
there are many more of them.

Removing __packed gives the expected direct accesses:

% 	.file	"z.c"
% 	.section	".text"
% 	.align 4
% 	.align 32
% 	.global r
% 	.type	r, #function
% 	.proc	020
% r:
% 	.register	%g2, #scratch
% 	add	%sp, -208, %sp
% 	mov	1, %g2
% 	sethi	%hi(x), %g1
% 	st	%g2, [%g1+%lo(x)]
% 	jmp	%o7+8
% 	 sub	%sp, -208, %sp
% 	.size	r, .-r
% 	.align 4
% 	.align 32
% 	.global q
% 	.type	q, #function
% 	.proc	016
% q:
% 	add	%sp, -208, %sp
% 	sethi	%hi(x), %g1
% 	lduw	[%g1+%lo(x)], %o0
% 	jmp	%o7+8
% 	 sub	%sp, -208, %sp
% 	.size	q, .-q
% 	.common	x,4,4
% 	.ident	"GCC: (GNU) 4.2.1 20070831 patched [FreeBSD]"

The memcpy's seem to be virtual now.  Maybe the compiler is avoiding
the shifts and masks for the packed case intentionally.

Timing tests on flame and pluto showed some problems.  The memcpy
versions are mostly faster in the non-__packed case and slower in the
__packed case.  This is as expected.  The above case where the compiler
does not virtualize the memcpy is especially slow, as expected, but there
are some other slow cases, and lots of differences between flame and pluto.
The __packed case is 4-20 times slower.

Bruce