towards fixing profiling for SMP

Bruce Evans brde at optusnet.com.au
Thu Jun 14 05:11:10 UTC 2007


Please review this patch.  It mainly adds SMP locking that has a chance
of working (very slowly due to massive contention).

% diff -c2 ./amd64/include/profile.h~ ./amd64/include/profile.h
% *** ./amd64/include/profile.h~	Sun Apr  1 13:55:36 2007
% --- ./amd64/include/profile.h	Sun Apr  1 13:55:37 2007
% ***************
% *** 78,93 ****
%   #endif /* !__GNUCLIKE_ASM */
%   #else /* !GUPROF */
% ! #define	MCOUNT_DECL(s)	u_long s;
% ! #ifdef SMP
% ! extern int	mcount_lock;
% ! #define	MCOUNT_ENTER(s)	{ s = read_rflags(); disable_intr(); \
% !  			  while (!atomic_cmpset_acq_int(&mcount_lock, 0, 1)) \
% ! 			  	/* nothing */ ; }
% ! #define	MCOUNT_EXIT(s)	{ atomic_store_rel_int(&mcount_lock, 0); \
% ! 			  write_rflags(s); }
% ! #else
% ! #define	MCOUNT_ENTER(s)	{ s = read_rflags(); disable_intr(); }
% ! #define	MCOUNT_EXIT(s)	(write_rflags(s))
% ! #endif
%   #endif /* GUPROF */
% 
% --- 78,84 ----
%   #endif /* !__GNUCLIKE_ASM */
%   #else /* !GUPROF */
% ! #define	MCOUNT_DECL(s)		register_t s;
% ! #define	MCOUNT_ENTER(s)		do { s = intr_disable(); } while (0)
% ! #define	MCOUNT_EXIT(s)		(intr_restore(s))
%   #endif /* GUPROF */
% 
% diff -c2 ./i386/include/profile.h~ ./i386/include/profile.h
% *** ./i386/include/profile.h~	Sat Oct 28 11:04:58 2006
% --- ./i386/include/profile.h	Sat Nov  4 19:12:08 2006
% ***************
% *** 78,93 ****
%   #endif /* !__GNUCLIKE_ASM */
%   #else /* !GUPROF */
% ! #define	MCOUNT_DECL(s)	u_long s;
% ! #ifdef SMP
% ! extern int	mcount_lock;
% ! #define	MCOUNT_ENTER(s)	{ s = read_eflags(); disable_intr(); \
% !  			  while (!atomic_cmpset_acq_int(&mcount_lock, 0, 1)) \
% ! 			  	/* nothing */ ; }
% ! #define	MCOUNT_EXIT(s)	{ atomic_store_rel_int(&mcount_lock, 0); \
% ! 			  write_eflags(s); }
% ! #else
% ! #define	MCOUNT_ENTER(s)	{ s = read_eflags(); disable_intr(); }
% ! #define	MCOUNT_EXIT(s)	(write_eflags(s))
% ! #endif
%   #endif /* GUPROF */
% 
% --- 78,84 ----
%   #endif /* !__GNUCLIKE_ASM */
%   #else /* !GUPROF */
% ! #define	MCOUNT_DECL(s)		register_t s;
% ! #define	MCOUNT_ENTER(s)		do { s = intr_disable(); } while (0)
% ! #define	MCOUNT_EXIT(s)		(intr_restore(s))
%   #endif /* GUPROF */
% 
% diff -c2 ./libkern/mcount.c~ ./libkern/mcount.c
% *** ./libkern/mcount.c~	Wed Jun 13 06:11:54 2007
% --- ./libkern/mcount.c	Wed Jun 13 06:12:14 2007
% ***************
% *** 34,40 ****
%   #include <sys/gmon.h>
%   #ifdef _KERNEL
% - #ifndef GUPROF
%   #include <sys/systm.h>
% - #endif
%   #include <vm/vm.h>
%   #include <vm/vm_param.h>
% --- 34,38 ----
% ***************
% *** 43,46 ****
% --- 41,101 ----
% 
%   /*
% +  * Locking strategy: use a simple spin mutex, but don't wait very long
% +  * for it.  When we don't acquire it, just give up.  We avoid using a
% +  * normal mutex since normal mutex code is profiled and might be
% +  * inefficient.  Not waiting very long avoids deadlock.  Our callers
% +  * should have used the usual anti-deadlock strategy of disabling
% +  * interrupts on the current CPU, but that is especially insufficient
% +  * for us since it doesn't affect debugger traps and NMIs, and at least
% +  * the debugger code is profiled.
% +  *
% +  * Note: the atomic functions used here, like all functions used in this
% +  * file, must be compiled without profiling.  This normally happens for
% +  * the atomic functions because they are inline.
% +  */
% + 
% + static __inline int
% + mcount_trylock(struct gmonparam *p, int want_both)
% + {
% + #ifdef SMP
% + 	int timeout;
% + 
% + 	timeout = 1000;		/* XXX ~= 10 times #statements in mcount(). */
% + 	do {
% + 		/* XXX having 2 "on" states complicates things. */
% + #ifdef GUPROF
% + 		if (atomic_cmpset_acq_int(&p->state, GMON_PROF_HIRES,
% + 		    GMON_PROF_BUSY))
% + 			return (GMON_PROF_HIRES);
% + #endif
% + 		if (want_both && atomic_cmpset_acq_int(&p->state, GMON_PROF_ON,
% + 		    GMON_PROF_BUSY))
% + 			return (GMON_PROF_ON);
% + 	} while (--timeout != 0);
% + #else /* !SMP */
% + 	/* Ugliness for efficiency. */
% + #ifdef GUPROF
% + 	if (p->state == GMON_PROF_HIRES) {
% + 		/* XXX want acq. */
% + 		atomic_store_rel_int(&p->state, GMON_PROF_BUSY);
% + 		return (GMON_PROF_HIRES);
% + 	}
% + #endif
% + 	if (want_both && p->state == GMON_PROF_ON) {
% + 		atomic_store_rel_int(&p->state, GMON_PROF_BUSY);
% + 		return (GMON_PROF_ON);
% + 	}
% + #endif /* SMP */
% + 	return (GMON_PROF_BUSY); /* Might actually be OFF or ERROR. */
% + }
% + 
% + static __inline void
% + mcount_unlock(struct gmonparam *p, int ostate)
% + {
% + 
% + 	atomic_store_rel_int(&p->state, ostate);
% + }
% + 
% + /*
%    * mcount is called on entry to each function compiled with the profiling
%    * switch set.  _mcount(), which is declared in a machine-dependent way
% ***************
% *** 68,71 ****
% --- 123,127 ----
%   	struct gmonparam *p;
%   	long toindex;
% + 	int ostate;
%   #ifdef _KERNEL
%   	MCOUNT_DECL(s)
% ***************
% *** 73,89 ****
% 
%   	p = &_gmonparam;
% ! #ifndef GUPROF			/* XXX */
%   	/*
% ! 	 * check that we are profiling
% ! 	 * and that we aren't recursively invoked.
%   	 */
% ! 	if (p->state != GMON_PROF_ON)
%   		return;
%   #endif
%   #ifdef _KERNEL
%   	MCOUNT_ENTER(s);
% - #else
% - 	p->state = GMON_PROF_BUSY;
%   #endif
% 
%   #ifdef _KERNEL
% --- 129,145 ----
% 
%   	p = &_gmonparam;
% ! #ifndef GUPROF
%   	/*
% ! 	 * Quick check that we are profiling.  In caller for GUPROF, and
% ! 	 * should always be there.
%   	 */
% ! 	if (p->state == GMON_PROF_OFF)
%   		return;
%   #endif
%   #ifdef _KERNEL
%   	MCOUNT_ENTER(s);
%   #endif
% + 	if ((ostate = mcount_trylock(p, 1)) == GMON_PROF_BUSY)
% + 		return;
% 
%   #ifdef _KERNEL
% ***************
% *** 101,105 ****
% 
%   #ifdef GUPROF
% ! 	if (p->state == GMON_PROF_HIRES) {
%   		/*
%   		 * Count the time since cputime() was previously called
% --- 157,161 ----
% 
%   #ifdef GUPROF
% ! 	if (ostate == GMON_PROF_HIRES) {
%   		/*
%   		 * Count the time since cputime() was previously called
% ***************
% *** 140,144 ****
%   	 */
%   	frompc = MCOUNT_FROMPC_INTR(selfpc);
% ! 	if ((frompc - p->lowpc) < p->textsize)
%   		frompci = frompc - p->lowpc;
%   #endif
% --- 196,201 ----
%   	 */
%   	frompc = MCOUNT_FROMPC_INTR(selfpc);
% ! 	/* XXX wording of above paragraph and the macro (exc not intr...). */
% ! 	if (frompc - p->lowpc < p->textsize)
%   		frompci = frompc - p->lowpc;
%   #endif
% ***************
% *** 224,235 ****
%   	}
%   done:
%   #ifdef _KERNEL
%   	MCOUNT_EXIT(s);
% - #else
% - 	p->state = GMON_PROF_ON;
%   #endif
%   	return;
%   overflow:
% ! 	p->state = GMON_PROF_ERROR;
%   #ifdef _KERNEL
%   	MCOUNT_EXIT(s);
% --- 281,291 ----
%   	}
%   done:
% + 	mcount_unlock(p, ostate);
%   #ifdef _KERNEL
%   	MCOUNT_EXIT(s);
%   #endif
%   	return;
%   overflow:
% ! 	mcount_unlock(p, GMON_PROF_ERROR);
%   #ifdef _KERNEL
%   	MCOUNT_EXIT(s);
% ***************
% *** 251,256 ****
% --- 307,315 ----
%   	struct gmonparam *p;
%   	uintfptr_t selfpcdiff;
% + 	int ostate;
% 
%   	p = &_gmonparam;
% + 	if ((ostate = mcount_trylock(p, 0)) == GMON_PROF_BUSY)
% + 		return;
%   	selfpcdiff = selfpc - (uintfptr_t)p->lowpc;
%   	if (selfpcdiff < p->textsize) {
% ***************
% *** 267,270 ****
% --- 326,330 ----
%   		*p->mexitcount_count += p->mexitcount_overhead;
%   	}
% + 	mcount_unlock(p, ostate);
%   }
%

Notes:

The old MD macros are not really needed and had many bugs.  "MI" locking
based on atomic_cmpset is no harder than for mutexes, and with it the need
for the macros goes away.
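
Distilled to the simplest case (a single "on" state, no bounded spin, no
!SMP fast path), the MI locking in mcount_trylock()/mcount_unlock() above
boils down to the pair below.  gmon_acquire()/gmon_release() are just
illustrative names, not part of the patch:

#include <sys/param.h>
#include <sys/gmon.h>
#include <machine/atomic.h>

/* Acquire: swing p->state from ON to BUSY; give up rather than wait. */
static __inline int
gmon_acquire(struct gmonparam *p)
{

	return (atomic_cmpset_acq_int(&p->state, GMON_PROF_ON,
	    GMON_PROF_BUSY));
}

/* Release: put back whatever state the acquire displaced. */
static __inline void
gmon_release(struct gmonparam *p, int ostate)
{

	atomic_store_rel_int(&p->state, ostate);
}

The caller remembers which state it displaced (GMON_PROF_ON, or
GMON_PROF_HIRES under GUPROF) and passes it back to the release, so
GMON_PROF_BUSY doubles as the lock.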

The main bug in the old macros is that using mcount_lock gives deadlock
if mcount is reentered.  This deadlock is very easy to arrange by
trying to trace through mcount using a debugger.  Debugger routines
aren't profiling-routines (which are compiled without -pg), so they are
compiled with -pg and thus call mcount.  The new code handles this
deadlock possibility as a special case -- attempts to reenter give up
after spinning for a bounded time.  Arches other than amd64 and i386
are missing mcount_lock and thus avoid this deadlock.
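
To make the deadlock concrete, here is the removed i386
MCOUNT_DECL()/MCOUNT_ENTER() pair expanded, with annotations added (the
amd64 version differs only in using rflags):

	u_long s;

	s = read_eflags();
	disable_intr();		/* masks interrupts, not debugger traps/NMIs */
	while (!atomic_cmpset_acq_int(&mcount_lock, 0, 1))
		/* nothing */ ;
	/*
	 * mcount_lock is now held by this CPU.  If a debugger trap (or
	 * an NMI) fires anywhere before MCOUNT_EXIT() releases the lock,
	 * and the handler's routines are compiled with -pg, they call
	 * mcount() again on this CPU; the nested call then spins on
	 * mcount_lock forever, since the outer holder cannot run until
	 * the trap returns.
	 */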

Disabling of interrupts doesn't cause deadlock, but it doesn't actually
prevent reentering mcount either.  It could be removed if the general
deadlock avoidance mechanism works well enough.  I haven't tried removing
it, or removing the macros entirely, but I made the interrupt disabling
and thus the macros "MI", modulo the problem that the intr_disable() API
isn't completely MI.  For sparc64 and sun4v, intr_disable() doesn't
disable _all_ interrupts, so the mcount macros for those arches must use
lower-level interrupt disabling.  I suspect that most uses of
intr_disable() should be using the lower-level interrupt disabling, since
those uses are very low-level (kdb, witness).  All other arches just use
intr_disable().
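
For concreteness, all that is left of the MD glue after the patch is the
three-liner below (the amd64 and i386 versions are identical); a sparc64
or sun4v version would keep the same shape but would have to call that
platform's lower-level interrupt-disabling primitive in place of
intr_disable():

/* <machine/profile.h>, !GUPROF case, after the patch: */
#define	MCOUNT_DECL(s)		register_t s;
#define	MCOUNT_ENTER(s)		do { s = intr_disable(); } while (0)
#define	MCOUNT_EXIT(s)		(intr_restore(s))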

Notes on mcount.c are supposed to be mainly in its comments.  Other notes:
- kernel locking becomes more similar to userland.  Basically, we just set
   the state to GMON_PROF_BUSY to prevent interference.  Userland should
   be using similar atomic ops to do this in the threaded case.
- I wanted to do a single atomic cmpset to swap between the busy and unbusy
   states, but couldn't get this to work as I wanted because there are
   more than 2 states.
- Everything needs to be per-CPU to avoid contention, but implementing that
   is too much work for me.  I've only implemented per-CPU timers.  (A
   rough sketch of what per-CPU counters might look like follows below.)
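
As a purely hypothetical illustration of the per-CPU direction (not
implemented; the struct and macro names below are invented), the hot
counters would move into per-CPU slots that mcount() can bump without any
lock, with the reader (kgmon(8)) summing the slots while profiling is
stopped:

#include <sys/param.h>
#include <sys/pcpu.h>

/*
 * Hypothetical per-CPU counter block; the alignment avoids false
 * sharing between CPUs.  The real froms[]/tos[]/kcount[] arrays would
 * need the same treatment to remove contention from mcount() itself.
 */
struct mcount_pcpu {
	u_long	calls;			/* example per-CPU counter */
} __aligned(128);			/* ~cache line */

static struct mcount_pcpu mcount_pcpu_slots[MAXCPU];

#define	MCOUNT_PCPU()	(&mcount_pcpu_slots[PCPU_GET(cpuid)])

A reader would then add up mcount_pcpu_slots[0..mp_ncpus-1], so the fast
path needs no locking at all.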

Bruce

