svn commit: r238907 - projects/calloutng/sys/kern

Attilio Rao attilio at freebsd.org
Mon Jul 30 14:02:13 UTC 2012


On 7/30/12, Davide Italiano <davide at freebsd.org> wrote:
> Author: davide
> Date: Mon Jul 30 13:50:37 2012
> New Revision: 238907
> URL: http://svn.freebsd.org/changeset/base/238907
>
> Log:
>   - Fix a LOR deadlock by dropping the callout lock while executing the
>   handler directly from hardware interrupt context.
>
>   - Add migration support for callouts which run from hw interrupt
>   context.
>
>   - Refactor a couple of comments to reflect the new world order.
>
>   TODO: implement proper stats for direct callouts.
>
>   Just a note: I'd like to thank flo@ for his invaluable help in testing
>   and reporting issues, mav@ who helped me in tackling the problem, and
>   Yahoo! which provided access to the machines on which tests were run.
>
>   Reported by:		avg, flo [1]
>   Discussed with:		mav
>
> Modified:
>   projects/calloutng/sys/kern/kern_timeout.c
>
> Modified: projects/calloutng/sys/kern/kern_timeout.c
> ==============================================================================
> --- projects/calloutng/sys/kern/kern_timeout.c	Mon Jul 30 12:25:20 2012	(r238906)
> +++ projects/calloutng/sys/kern/kern_timeout.c	Mon Jul 30 13:50:37 2012	(r238907)
> @@ -91,45 +91,62 @@ SYSCTL_INT(_debug, OID_AUTO, to_avg_mpca
>  int callwheelsize, callwheelmask;
>
>  /*
> - * The callout cpu migration entity represents informations necessary for
> - * describing the migrating callout to the new callout cpu.
> + * The callout cpu exec entities represent informations necessary for
> + * describing the state of callouts currently running on the CPU and the ones
> + * necessary for migrating callouts to the new callout cpu. In particular,
> + * the first entry of the array cc_exec_entity holds informations for callout
> + * running in SWI thread context, while the second one holds informations
> + * for callout running directly from hardware interrupt context.
>   * The cached informations are very important for deferring migration when
>   * the migrating callout is already running.
>   */
> -struct cc_mig_ent {
> +struct cc_exec {
> +	struct callout		*cc_next;
> +	struct callout		*cc_curr;
>  #ifdef SMP
>  	void			(*ce_migration_func)(void *);
>  	void			*ce_migration_arg;
>  	int			ce_migration_cpu;
>  	struct bintime		ce_migration_time;
>  #endif
> +	int			cc_cancel;
> +	int			cc_waiting;
>  };
>  	
>  /*
> - * There is one struct callout_cpu per cpu, holding all relevant
> + * There is one struct callou_cpu per cpu, holding all relevant
>   * state for the callout processing thread on the individual CPU.
>   */
>  struct callout_cpu {
> -	struct cc_mig_ent	cc_migrating_entity;
> +	struct cc_exec 		cc_exec_entity[2];
>  	struct mtx		cc_lock;
>  	struct callout		*cc_callout;
>  	struct callout_tailq	*cc_callwheel;
>  	struct callout_tailq	cc_expireq;		
>  	struct callout_list	cc_callfree;
> -	struct callout		*cc_next;
> -	struct callout		*cc_curr;
>  	struct bintime 		cc_firstevent;
>  	struct bintime 		cc_lastscan;
>  	void			*cc_cookie;
> -	int			cc_cancel;
> -	int			cc_waiting;
>  };
>
> +#define	cc_exec_curr		cc_exec_entity[0].cc_curr
> +#define	cc_exec_next		cc_exec_entity[0].cc_next
> +#define	cc_exec_cancel		cc_exec_entity[0].cc_cancel
> +#define	cc_exec_waiting		cc_exec_entity[0].cc_waiting
> +#define	cc_exec_curr_dir	cc_exec_entity[1].cc_curr
> +#define	cc_exec_next_dir	cc_exec_entity[1].cc_next
> +#define	cc_exec_cancel_dir	cc_exec_entity[1].cc_cancel
> +#define	cc_exec_waiting_dir	cc_exec_entity[1].cc_waiting
> +
>  #ifdef SMP
> -#define	cc_migration_func	cc_migrating_entity.ce_migration_func
> -#define	cc_migration_arg	cc_migrating_entity.ce_migration_arg
> -#define	cc_migration_cpu	cc_migrating_entity.ce_migration_cpu
> -#define	cc_migration_time	cc_migrating_entity.ce_migration_time
> +#define	cc_migration_func	cc_exec_entity[0].ce_migration_func
> +#define	cc_migration_arg	cc_exec_entity[0].ce_migration_arg
> +#define	cc_migration_cpu	cc_exec_entity[0].ce_migration_cpu
> +#define	cc_migration_time	cc_exec_entity[0].ce_migration_time
> +#define	cc_migration_func_dir	cc_exec_entity[1].ce_migration_func
> +#define	cc_migration_arg_dir	cc_exec_entity[1].ce_migration_arg
> +#define	cc_migration_cpu_dir	cc_exec_entity[1].ce_migration_cpu
> +#define	cc_migration_time_dir	cc_exec_entity[1].ce_migration_time
>
>  struct callout_cpu cc_cpu[MAXCPU];
>  #define	CPUBLOCK	MAXCPU
> @@ -156,6 +173,9 @@ struct callout_cpu cc_cpu;
>
>  static int timeout_cpu;
>  void (*callout_new_inserted)(int cpu, struct bintime bt) = NULL;
> +static struct callout *
> +softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls,
> +    int *lockcalls, int *gcalls, int direct);
>
>  static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
>
> @@ -180,15 +200,18 @@ static MALLOC_DEFINE(M_CALLOUT, "callout
>   * Resets the migration entity tied to a specific callout cpu.
>   */
>  static void
> -cc_cme_cleanup(struct callout_cpu *cc)
> +cc_cme_cleanup(struct callout_cpu *cc, int direct)
>  {
> -
> +	
> +	cc->cc_exec_entity[direct].cc_curr = NULL;	
> +	cc->cc_exec_entity[direct].cc_next = NULL;
> +	cc->cc_exec_entity[direct].cc_cancel = 0;
> +	cc->cc_exec_entity[direct].cc_waiting = 0;
>  #ifdef SMP
> -	cc->cc_migration_cpu = CPUBLOCK;
> -	cc->cc_migration_time.sec = 0;
> -	cc->cc_migration_time.frac = 0;
> -	cc->cc_migration_func = NULL;
> -	cc->cc_migration_arg = NULL;
> +	cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK;
> +	bintime_clear(&cc->cc_exec_entity[direct].ce_migration_time);
> +	cc->cc_exec_entity[direct].ce_migration_func = NULL;
> +	cc->cc_exec_entity[direct].ce_migration_arg = NULL;
>  #endif
>  }
>
> @@ -196,11 +219,12 @@ cc_cme_cleanup(struct callout_cpu *cc)
>   * Checks if migration is requested by a specific callout cpu.
>   */
>  static int
> -cc_cme_migrating(struct callout_cpu *cc)
> +cc_cme_migrating(struct callout_cpu *cc, int direct)
>  {
>
>  #ifdef SMP
> -	return (cc->cc_migration_cpu != CPUBLOCK);
> +	
> +	return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK);
>  #else
>  	return (0);
>  #endif
> @@ -246,7 +270,8 @@ callout_cpu_init(struct callout_cpu *cc)
>  		TAILQ_INIT(&cc->cc_callwheel[i]);
>  	}
>  	TAILQ_INIT(&cc->cc_expireq);
> -	cc_cme_cleanup(cc);
> +	for (i = 0; i < 2; i++)
> +		cc_cme_cleanup(cc, i);
>  	if (cc->cc_callout == NULL)
>  		return;
>  	for (i = 0; i < ncallout; i++) {
> @@ -355,7 +380,8 @@ callout_process(struct bintime *now)
>  	struct callout *tmp;
>  	struct callout_cpu *cc;
>  	struct callout_tailq *sc;
> -	int cpu, first, future, last, need_softclock;
> +	int cpu, first, future, gcalls, mpcalls, last, lockcalls,
> +	    need_softclock;
>
>  	/*
>  	 * Process callouts at a very low cpu priority, so we don't keep the
> @@ -376,7 +402,8 @@ callout_process(struct bintime *now)
>  	first &= callwheelmask;
>  	for (;;) {	
>  		sc = &cc->cc_callwheel[first];
> -		TAILQ_FOREACH(tmp, sc, c_links.tqe) {
> +		tmp = TAILQ_FIRST(sc);
> +		while (tmp != NULL) {	
>  			next = tmp->c_time;
>  			bintime_sub(&next, &tmp->c_precision);
>  			if (bintime_cmp(&next, now, <=)) {
> @@ -385,17 +412,20 @@ callout_process(struct bintime *now)
>  				 * directly from hardware interrupt context.
>  				 */
>  				if (tmp->c_flags & CALLOUT_DIRECT) {
> -					tmp->c_func(tmp->c_arg);
>  					TAILQ_REMOVE(sc, tmp, c_links.tqe);
> -			                tmp->c_flags &= ~CALLOUT_PENDING;
> +					tmp = softclock_call_cc(tmp, cc,
> +					    &mpcalls, &lockcalls, &gcalls, 1);
>  				} else {
>  					TAILQ_INSERT_TAIL(&cc->cc_expireq,
>  					    tmp, c_staiter);
>  					TAILQ_REMOVE(sc, tmp, c_links.tqe);
>  					tmp->c_flags |= CALLOUT_PROCESSED;
>  					need_softclock = 1;
> +					tmp = TAILQ_NEXT(tmp, c_links.tqe);
>  				}
>  			}	
> +			else
> +				tmp = TAILQ_NEXT(tmp, c_links.tqe);
>  		}	
>  		if (first == last)
>  			break;
> @@ -561,11 +591,12 @@ callout_cc_add(struct callout *c, struct
>  }
>
>  static void
> -callout_cc_del(struct callout *c, struct callout_cpu *cc)
> +callout_cc_del(struct callout *c, struct callout_cpu *cc, int direct)
>  {
> -
> -	if (cc->cc_next == c)
> -		cc->cc_next = TAILQ_NEXT(c, c_staiter);
> +	if (direct && cc->cc_exec_next_dir == c)
> +		cc->cc_exec_next_dir = TAILQ_NEXT(c, c_links.tqe);
> +	else if (!direct && cc->cc_exec_next == c)
> +		cc->cc_exec_next = TAILQ_NEXT(c, c_staiter);
>  	if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
>  		c->c_func = NULL;
>  		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
> @@ -574,13 +605,13 @@ callout_cc_del(struct callout *c, struct
>
>  static struct callout *
>  softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls,
> -    int *lockcalls, int *gcalls)
> +    int *lockcalls, int *gcalls, int direct)
>  {
>  	void (*c_func)(void *);
>  	void *c_arg;
>  	struct lock_class *class;
>  	struct lock_object *c_lock;
> -	int c_flags, sharedlock;
> +	int c_flags, flags, sharedlock;
>  #ifdef SMP
>  	struct callout_cpu *new_cc;
>  	void (*new_func)(void *);
> @@ -595,7 +626,14 @@ softclock_call_cc(struct callout *c, str
>  	static timeout_t *lastfunc;
>  #endif
>
> -	cc->cc_next = TAILQ_NEXT(c, c_staiter);
> +	if (direct)
> +		cc->cc_exec_next_dir = TAILQ_NEXT(c, c_links.tqe);
> +	else {
> +		if ((c->c_flags & CALLOUT_PROCESSED) == 0) 	
> +			cc->cc_exec_next = TAILQ_NEXT(c, c_links.tqe);
> +		else
> +			cc->cc_exec_next = TAILQ_NEXT(c, c_staiter);
> +	}
>  	class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
>  	sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1;
>  	c_lock = c->c_lock;
> @@ -606,8 +644,8 @@ softclock_call_cc(struct callout *c, str
>  		c->c_flags = CALLOUT_LOCAL_ALLOC;
>  	else
>  		c->c_flags &= ~CALLOUT_PENDING;
> -	cc->cc_curr = c;
> -	cc->cc_cancel = 0;
> +	cc->cc_exec_entity[direct].cc_curr = c;
> +	cc->cc_exec_entity[direct].cc_cancel = 0;
>  	CC_UNLOCK(cc);
>  	if (c_lock != NULL) {
>  		class->lc_lock(c_lock, sharedlock);
> @@ -615,13 +653,12 @@ softclock_call_cc(struct callout *c, str
>  		 * The callout may have been cancelled
>  		 * while we switched locks.
>  		 */
> -		if (cc->cc_cancel) {
> +		if (cc->cc_exec_entity[direct].cc_cancel) {
>  			class->lc_unlock(c_lock);
>  			goto skip;
>  		}
>  		/* The callout cannot be stopped now. */
> -		cc->cc_cancel = 1;
> -
> +		cc->cc_exec_entity[direct].cc_cancel = 1;
>  		if (c_lock == &Giant.lock_object) {
>  			(*gcalls)++;
>  			CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
> @@ -639,11 +676,13 @@ softclock_call_cc(struct callout *c, str
>  #ifdef DIAGNOSTIC
>  	binuptime(&bt1);
>  #endif
> -	THREAD_NO_SLEEPING();
> +	if (!direct)
> +		THREAD_NO_SLEEPING();
>  	SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0);
>  	c_func(c_arg);
>  	SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0);
> -	THREAD_SLEEPING_OK();
> +	if (!direct)
> +		THREAD_SLEEPING_OK();

I didn't look at the full patch; however, I have a question about this:
was it necessary because this now runs in interrupt context, so it
could trip the nested sleeping check? Or is there something else?
I think it would be a good thing to keep THREAD_NO_SLEEPING() also in
the direct dispatch case.
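
To make the suggestion concrete, a rough sketch (untested, and whether
it is safe from hw interrupt context is exactly my question above):
keep the assertion unconditional around the handler invocation, as the
pre-patch code did:

	/*
	 * Sketch only: assert "no sleeping" in both the SWI and the
	 * direct-dispatch paths instead of skipping it when direct != 0.
	 */
	THREAD_NO_SLEEPING();
	SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0);
	c_func(c_arg);
	SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0);
	THREAD_SLEEPING_OK();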

Attilio


-- 
Peace can only be achieved by understanding - A. Einstein

