ule+smp: small optimization for turnstile priority lending
Attilio Rao
attilio at freebsd.org
Tue Sep 18 23:02:00 UTC 2012
On Tue, Sep 18, 2012 at 8:07 PM, Andriy Gapon <avg at freebsd.org> wrote:
> on 18/09/2012 19:50 Attilio Rao said the following:
>> On 9/18/12, Andriy Gapon <avg at freebsd.org> wrote:
>>>
>>> Here is a snippet that demonstrates the issue on a supposedly fully loaded
>>> 2-processor system:
>>>
>>> 136794 0 3670427870244462 KTRGRAPH group:"thread", id:"Xorg tid 102818", state:"running", attributes: prio:122
>>>
>>> 136793 0 3670427870241000 KTRGRAPH group:"thread", id:"cc1plus tid 111916", state:"yielding", attributes: prio:183, wmesg:"(null)", lockname:"(null)"
>>>
>>> 136792 1 3670427870240829 KTRGRAPH group:"thread", id:"idle: cpu1 tid 100004", state:"running", attributes: prio:255
>>>
>>> 136791 1 3670427870239520 KTRGRAPH group:"load", id:"CPU 1 load", counter:0, attributes: none
>>>
>>> 136790 1 3670427870239248 KTRGRAPH group:"thread", id:"firefox tid 113473", state:"blocked", attributes: prio:122, wmesg:"(null)", lockname:"unp_mtx"
>>>
>>> 136789 1 3670427870237697 KTRGRAPH group:"load", id:"CPU 0 load", counter:2, attributes: none
>>>
>>> 136788 1 3670427870236394 KTRGRAPH group:"thread", id:"firefox tid 113473", point:"wokeup", attributes: linkedto:"Xorg tid 102818"
>>>
>>> 136787 1 3670427870236145 KTRGRAPH group:"thread", id:"Xorg tid 102818", state:"runq add", attributes: prio:122, linkedto:"firefox tid 113473"
>>>
>>> 136786 1 3670427870235981 KTRGRAPH group:"load", id:"CPU 1 load", counter:1, attributes: none
>>>
>>> 136785 1 3670427870235707 KTRGRAPH group:"thread", id:"Xorg tid 102818", state:"runq rem", attributes: prio:176
>>>
>>> 136784 1 3670427870235423 KTRGRAPH group:"thread", id:"Xorg tid 102818", point:"prio", attributes: prio:176, new prio:122, linkedto:"firefox tid 113473"
>>>
>>> 136783 1 3670427870202392 KTRGRAPH group:"thread", id:"firefox tid 113473", state:"running", attributes: prio:104
>>>
>>> See how the Xorg thread was forced from CPU 1 to CPU 0, where it preempted
>>> the cc1plus thread (I do have preemption enabled), only to leave CPU 1 with
>>> zero load.
>>
>> I think the idea is bright, but I have reservations about the
>> implementation because it seems to me there are too many layering
>> violations.
>
> Just one - for a layer between turnstile and scheduler :-)
> But I agree.
>
>> What I suggest can be summarized as follows:
>> - Add a new SRQ_WILLSLEEP flag, or whatever name you prefer
>> - Add a new "flags" argument to sched_lend_prio() (both ULE and 4BSD)
>> and to sched_thread_priority() (ULE only)
>> - sched_thread_priority() will pass the new flag down to sched_add(),
>> which passes it down to sched_pickcpu().
>>
>> This way sched_pickcpu() has correct knowledge of what is going on
>> and can make the right decision. You likely don't need to lower
>> tdq_load beforehand either, because sched_pickcpu() can just adjust
>> it locally for its decision.
>>
>> What do you think?
>
> This sounds easy, but it is not quite so given the implementation of
> sched_pickcpu and sched_lowest. This is probably more work than I am
> able to take on now.
I actually think that the attached patch should do what you need. Of
course there is more runqueue locking, due to the direct tdq_load
manipulation, but I'm not sure it is really a big deal.

I haven't tested it yet; as I understand you have a test case, maybe
you can. If Jeff agrees, I can also send the patch to flo@ for
performance evaluation.
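
To make the suggested flow concrete, here is a rough sketch of how the
new flag is meant to travel from turnstile_wait() down to
sched_pickcpu(). This is illustrative only and not compile-tested:
locking, the function bodies, and local declarations (td, pri) are
elided, and the real changes are in the patch below.

/* subr_turnstile.c: curthread (td) is about to block and switch out. */
void
turnstile_wait(struct turnstile *ts, struct thread *owner, int queue)
{
        /* ... enqueue curthread on the turnstile ... */
        propagate_priority(td, SRQ_WILLSWITCH);
}

/* The SRQ_* flags ride along with each priority-lending step. */
static void
propagate_priority(struct thread *td, int flags)
{
        /* ... walk the chain of lock owners, computing pri ... */
        sched_lend_prio(td, pri, flags);
}

/* sched_ule.c: the flag finally reaches CPU selection. */
static int
sched_pickcpu(struct thread *td, int flags)
{
        /*
         * The current CPU will lose curthread shortly, so discount
         * its load before searching for the least loaded CPU.
         */
        if (flags & SRQ_WILLSWITCH)
                /* ... temporarily decrement this CPU's tdq_load ... */;
        /* ... usual affinity and topology search ... */
}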
Thoughts?
Attilio
Index: sys/sys/sched.h
===================================================================
--- sys/sys/sched.h (revision 240672)
+++ sys/sys/sched.h (working copy)
@@ -91,7 +91,7 @@ void sched_nice(struct proc *p, int nice);
*/
void sched_exit_thread(struct thread *td, struct thread *child);
void sched_fork_thread(struct thread *td, struct thread *child);
-void sched_lend_prio(struct thread *td, u_char prio);
+void sched_lend_prio(struct thread *td, u_char prio, int flags);
void sched_lend_user_prio(struct thread *td, u_char pri);
fixpt_t sched_pctcpu(struct thread *td);
void sched_prio(struct thread *td, u_char prio);
@@ -161,6 +161,7 @@ sched_unpin(void)
#define SRQ_INTR 0x0004 /* It is probably urgent. */
#define SRQ_PREEMPTED 0x0008 /* has been preempted.. be kind */
#define SRQ_BORROWING 0x0010 /* Priority updated due to prio_lend */
+#define SRQ_WILLSWITCH 0x0020 /* curthread will be switched out */
/* Scheduler stats. */
#ifdef SCHED_STATS
Index: sys/kern/sched_ule.c
===================================================================
--- sys/kern/sched_ule.c (revision 240672)
+++ sys/kern/sched_ule.c (working copy)
@@ -290,7 +290,7 @@ static struct tdq tdq_cpu;
#define TDQ_LOCKPTR(t) (&(t)->tdq_lock)
static void sched_priority(struct thread *);
-static void sched_thread_priority(struct thread *, u_char);
+static void sched_thread_priority(struct thread *, u_char, int);
static int sched_interact_score(struct thread *);
static void sched_interact_update(struct thread *);
static void sched_interact_fork(struct thread *);
@@ -1233,6 +1233,18 @@ sched_pickcpu(struct thread *td, int flags)
}
}
/*
+ * If SRQ_WILLSWITCH is set, curthread on the current CPU is going
+ * to be switched out very soon, so the current CPU should be
+ * considered less loaded than it appears.
+ * tdq_load is adjusted directly, rather than via tdq_load_rem(),
+ * because there is no need to handle tdq_sysload for this purpose.
+ */
+ if (flags & SRQ_WILLSWITCH) {
+ TDQ_LOCK(TDQ_CPU(self));
+ TDQ_CPU(self)->tdq_load--;
+ TDQ_UNLOCK(TDQ_CPU(self));
+ }
+ /*
* Search for the last level cache CPU group in the tree.
* Skip caches with expired affinity time and SMT groups.
* Affinity to higher level caches will be handled less aggressively.
@@ -1270,6 +1282,11 @@ sched_pickcpu(struct thread *td, int flags)
cpu = self;
} else
SCHED_STAT_INC(pickcpu_lowest);
+ if (flags & SRQ_WILLSWITCH) {
+ TDQ_LOCK(TDQ_CPU(self));
+ TDQ_CPU(self)->tdq_load++;
+ TDQ_UNLOCK(TDQ_CPU(self));
+ }
if (cpu != ts->ts_cpu)
SCHED_STAT_INC(pickcpu_migration);
return (cpu);
@@ -1629,7 +1646,7 @@ sched_pctcpu_update(struct td_sched *ts, int run)
* functions.
*/
static void
-sched_thread_priority(struct thread *td, u_char prio)
+sched_thread_priority(struct thread *td, u_char prio, int flags)
{
struct td_sched *ts;
struct tdq *tdq;
@@ -1659,7 +1676,7 @@ static void
if (TD_ON_RUNQ(td) && prio < td->td_priority) {
sched_rem(td);
td->td_priority = prio;
- sched_add(td, SRQ_BORROWING);
+ sched_add(td, flags | SRQ_BORROWING);
return;
}
/*
@@ -1684,11 +1701,11 @@ static void
* priority.
*/
void
-sched_lend_prio(struct thread *td, u_char prio)
+sched_lend_prio(struct thread *td, u_char prio, int flags)
{
td->td_flags |= TDF_BORROWING;
- sched_thread_priority(td, prio);
+ sched_thread_priority(td, prio, flags);
}
/*
@@ -1711,9 +1728,9 @@ sched_unlend_prio(struct thread *td, u_char prio)
base_pri = td->td_base_pri;
if (prio >= base_pri) {
td->td_flags &= ~TDF_BORROWING;
- sched_thread_priority(td, base_pri);
+ sched_thread_priority(td, base_pri, 0);
} else
- sched_lend_prio(td, prio);
+ sched_lend_prio(td, prio, 0);
}
/*
@@ -1736,7 +1753,7 @@ sched_prio(struct thread *td, u_char prio)
/* Change the real priority. */
oldprio = td->td_priority;
- sched_thread_priority(td, prio);
+ sched_thread_priority(td, prio, 0);
/*
* If the thread is on a turnstile, then let the turnstile update
Index: sys/kern/sched_4bsd.c
===================================================================
--- sys/kern/sched_4bsd.c (revision 240672)
+++ sys/kern/sched_4bsd.c (working copy)
@@ -859,7 +859,7 @@ sched_priority(struct thread *td, u_char prio)
* priority.
*/
void
-sched_lend_prio(struct thread *td, u_char prio)
+sched_lend_prio(struct thread *td, u_char prio, int flags __unused)
{
td->td_flags |= TDF_BORROWING;
@@ -888,7 +888,7 @@ sched_unlend_prio(struct thread *td, u_char prio)
td->td_flags &= ~TDF_BORROWING;
sched_prio(td, base_pri);
} else
- sched_lend_prio(td, prio);
+ sched_lend_prio(td, prio, 0);
}
void
Index: sys/kern/subr_turnstile.c
===================================================================
--- sys/kern/subr_turnstile.c (revision 240672)
+++ sys/kern/subr_turnstile.c (working copy)
@@ -158,7 +158,7 @@ static void init_turnstile0(void *dummy);
#ifdef TURNSTILE_PROFILING
static void init_turnstile_profiling(void *arg);
#endif
-static void propagate_priority(struct thread *td);
+static void propagate_priority(struct thread *td, int flags);
static int turnstile_adjust_thread(struct turnstile *ts,
struct thread *td);
static struct thread *turnstile_first_waiter(struct turnstile *ts);
@@ -178,9 +178,10 @@ SDT_PROBE_DEFINE2(sched, , , wakeup, wakeup, "stru
* Walks the chain of turnstiles and their owners to propagate the priority
* of the thread being blocked to all the threads holding locks that have to
* release their locks before this thread can run again.
+ * It accepts SRQ_* information as flags.
*/
static void
-propagate_priority(struct thread *td)
+propagate_priority(struct thread *td, int flags)
{
struct turnstile *ts;
int pri;
@@ -240,7 +241,7 @@ static void
/*
* Bump this thread's priority.
*/
- sched_lend_prio(td, pri);
+ sched_lend_prio(td, pri, flags);
/*
* If lock holder is actually running or on the run queue
@@ -445,7 +446,7 @@ turnstile_adjust(struct thread *td, u_char oldpri)
td->td_tsqueue == TS_SHARED_QUEUE);
if (td == TAILQ_FIRST(&ts->ts_blocked[td->td_tsqueue]) &&
td->td_priority < oldpri) {
- propagate_priority(td);
+ propagate_priority(td, 0);
}
}
@@ -659,7 +660,7 @@ turnstile_claim(struct turnstile *ts)
*/
thread_lock(owner);
if (td->td_priority < owner->td_priority)
- sched_lend_prio(owner, td->td_priority);
+ sched_lend_prio(owner, td->td_priority, 0);
thread_unlock(owner);
tc = TC_LOOKUP(ts->ts_lockobj);
mtx_unlock_spin(&ts->ts_lock);
@@ -741,7 +742,7 @@ turnstile_wait(struct turnstile *ts, struct thread
td->td_blktick = ticks;
TD_SET_LOCK(td);
mtx_unlock_spin(&tc->tc_lock);
- propagate_priority(td);
+ propagate_priority(td, SRQ_WILLSWITCH);
if (LOCK_LOG_TEST(lock, 0))
CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__,