cvs commit: src/sys/i386/i386 apic_vector.s src/sys/i386/isa
atpic_vector.s
Bruce Evans
bde at zeta.org.au
Mon Feb 2 03:24:57 PST 2004
On Mon, 2 Feb 2004, Bruce Evans wrote:
> On Mon, 2 Feb 2004, Andy Farkas wrote:
>
> > On Wed, 28 Jan 2004, John Baldwin wrote:
> >
> > > Modified files:
> > > sys/i386/i386 apic_vector.s
> > > sys/i386/isa atpic_vector.s
> > > Log:
> > > Optimize the i386 interrupt entry code to not reload the segment registers
> > > if they already contain the correct kernel selectors.
> >
> > What effect on performance does this change have? It seems to be a rather
> > significant change to an important code path, or am I totally confused..?
>
> I measured it in userland and saw about -1 cycles/interrupt on an AthlonXP
> and about -22 cycles/interrupt on an old Celeron (negative means a
> pessimization).
Bah, the benchmark that gave these results was very buggy. Here is a
better version. Results first (for a random pattern to simulate 75% of
interrupts coming from userland):
Celeron 366:
-20 cycles/interrupt optimization
Celeron 366 with partial register stall pessimization backed out:
No significant change (measured -0.05 cycles/interrupt less)
P2 (or whatever freefall is):
Similar to Celeron. Machine too loaded for accurate test.
AthlonXP:
+4 cycles/interrupt
Best cases for this and some other percentages:
100% user: Celeron 0- AthlonXP -1
75% user: Celeron 0+ AthlonXP +4
50% user: Celeron +2 AthlonXP +9
25% user: Celeron +9 AthlonXP +15
0% user: Celeron +20 AthlonXP +21
The branch prediction seems to be too good to be true.
%%%
#include <sys/types.h>
#include <machine/cpufunc.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * "New way" interrupt-entry prologue: probe %fs and skip reloading the
 * three data segment registers when %fs already holds the kernel
 * per-cpu selector (KPSEL).  The pushal/pushl ... popl/popal pairs
 * save and restore everything so the probe leaves register state
 * unchanged.
 *
 * NOTE(review): KDSEL and KPSEL are dummy, equal values here (0x2f);
 * the benchmark measures the instruction sequence, not real selector
 * loads.  The 16-bit "mov $KDSEL,%ax" writes only %ax — this is the
 * partial register write discussed in the message text.
 */
static void
testnewway(void)
{
__asm(" \n\
KDSEL = 0x2f \n\
KPSEL = 0x2f \n\
pushal \n\
pushl %ds \n\
pushl %es \n\
pushl %fs \n\
mov %fs,%ax \n\
cmp $KPSEL,%ax \n\
je 1f \n\
mov $KDSEL,%ax \n\
mov %ax,%ds \n\
mov %ax,%es \n\
mov $KPSEL,%ax \n\
mov %ax,%fs \n\
1: \n\
popl %fs \n\
popl %es \n\
popl %ds \n\
popal \n\
");
}
/*
 * "Old way" interrupt-entry prologue: unconditionally reload %ds, %es
 * and %fs with the kernel selectors on every entry, using 32-bit
 * "movl $...,%eax" so there is no partial register write.
 *
 * NOTE(review): the "1:" label is unused in this variant — apparently
 * left over from copying the conditional version.
 */
static void
testoldway(void)
{
__asm(" \n\
KDSEL = 0x2f \n\
KPSEL = 0x2f \n\
pushal \n\
pushl %ds \n\
pushl %es \n\
pushl %fs \n\
movl $KDSEL,%eax \n\
mov %ax,%ds \n\
mov %ax,%es \n\
movl $KPSEL,%eax \n\
mov %ax,%fs \n\
1: \n\
popl %fs \n\
popl %es \n\
popl %ds \n\
popal \n\
");
}
/*
 * "New way" with fix #1 for the partial register stall: keep the
 * conditional skip, but load the selectors with 32-bit
 * "movl $...,%eax" (full register write) instead of the 16-bit
 * "mov $...,%ax" used in testnewway().
 */
static void
testnewwayfix1(void)
{
__asm(" \n\
KDSEL = 0x2f \n\
KPSEL = 0x2f \n\
pushal \n\
pushl %ds \n\
pushl %es \n\
pushl %fs \n\
mov %fs,%ax \n\
cmp $KPSEL,%ax \n\
je 1f \n\
movl $KDSEL,%eax \n\
mov %ax,%ds \n\
mov %ax,%es \n\
movl $KPSEL,%eax \n\
mov %ax,%fs \n\
1: \n\
popl %fs \n\
popl %es \n\
popl %ds \n\
popal \n\
");
}
static void
testnewwayfix2(void)
{
/*
 * New way with a different way of avoiding the partial register
 * stalls: keep the 16-bit loads but force the operand size prefix
 * (0x66) on each segment-register mov.
 *
 * It seems that gcc now understands "mov %ax,%ds" like I did in
 * 1986, but that understanding is wrong :-).  gcc leaves out the
 * operand size prefix, but the prefix is apparently needed to
 * actually mov from %ax.  Without it the mov is apparently from
 * %eax and there is a partial register store if we only load %ax.
 * gcc refuses to produce the prefix for "mov %eax,%ds".
 *
 * gcc understands mov's from segment registers to general
 * registers better.  Then it is clear that the target may be
 * either 16 or 32 bits.  Intel now documents what happens in at
 * least this case.  IIRC, it says that the operand size prefix
 * works as should be expected, and the top 16 bits of the target
 * are set to an indeterminate value.  See the commit logs for
 * <machine/cpufunc.h> for where I shot down a pessimization of
 * this (we avoid using the operand size prefix, so we get garbage
 * in the top bits).
 *
 * This subset of the benchmark is just to demonstrate that using
 * the operand size prefix to force 16-bit operations is just a
 * pessimization.  It is only a small pessimization though.  Until
 * today (2004/01/04) I didn't understand why getting the prefix
 * wrong was such a large pessimization.  It was due to partial
 * register stalls more than the prefix.
 */
__asm(" \n\
KDSEL = 0x2f \n\
KPSEL = 0x2f \n\
pushal \n\
pushl %ds \n\
pushl %es \n\
pushl %fs \n\
mov %fs,%ax \n\
cmp $KPSEL,%ax \n\
je 1f \n\
mov $KDSEL,%ax \n\
.byte 0x66 \n\
mov %ax,%ds \n\
.byte 0x66 \n\
mov %ax,%es \n\
mov $KPSEL,%ax \n\
.byte 0x66 \n\
mov %ax,%fs \n\
1: \n\
popl %fs \n\
popl %es \n\
popl %ds \n\
popal \n\
");
}
/* Iterations per timed loop; state[] holds the precomputed user/kernel pattern. */
#define SIZE 1000000
unsigned char state[SIZE];
/*
 * Set %fs according to the precomputed random pattern, simulating the
 * selector an interrupt would find on entry: state[i] != 0 stands for
 * "came from user mode" and loads %fs with the %cs value (presumably
 * != KPSEL, so the reload branch is taken — confirm against the real
 * selectors); state[i] == 0 stands for "came from kernel mode".
 */
static void
randfs(int i)
{
if (state[i] != 0)
__asm("pushl %cs; popl %fs");
else
__asm("pushl %ds; popl %fs");
}
int
main(void)
{
double statetot;
unsigned long long start;
int i;
statetot = 0;
for (i = 0; i < SIZE; i++) {
#if 0
state[i] = (random() >> 30) & 1;
#else
/*
* Bias the state to simulate that the system probably
* spends most of its time in user mode (state[i] != 0).
* Guess 75% in user mode.
*/
state[i] = (((random() >> 28) & 3) != 0);
#endif
statetot += state[i];
}
start = rdtsc();
for (i = 0; i < SIZE; i++) {
randfs(i);
testnewway();
}
printf("%llu\n", rdtsc() - start);
start = rdtsc();
for (i = 0; i < SIZE; i++) {
randfs(i);
testoldway();
}
printf("%llu\n", rdtsc() - start);
start = rdtsc();
for (i = 0; i < SIZE; i++) {
randfs(i);
testnewwayfix1();
}
printf("%llu\n", rdtsc() - start);
start = rdtsc();
for (i = 0; i < SIZE; i++) {
randfs(i);
testnewwayfix2();
}
printf("%llu\n", rdtsc() - start);
printf("state average %.3f\n", statetot / SIZE);
return (0);
}
%%%
Bruce
More information about the cvs-all
mailing list