svn commit: r317137 - in head/sys/compat/linuxkpi/common: include/linux src

Hans Petter Selasky hselasky at FreeBSD.org
Wed Apr 19 13:03:31 UTC 2017


Author: hselasky
Date: Wed Apr 19 13:03:29 2017
New Revision: 317137
URL: https://svnweb.freebsd.org/changeset/base/317137

Log:
  Fix a priority inversion problem when using the Concurrency Kit, CK,
  in the LinuxKPI.
  
  When threads are pinned to a CPU core, or when there is only one CPU,
  a higher priority thread may call the CK synchronize function while a
  lower priority thread still holds the read lock. Because CK's
  synchronize function is a simple wait loop, this can lead to a
  deadlock. To solve this problem, use CK's recently introduced wait
  callback function.
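
  For illustration, a minimal sketch of the affected pattern follows.
  This is hypothetical LinuxKPI consumer code, not part of this commit;
  struct foo, use_foo() and free_foo() are made-up names:

    #include <linux/rcupdate.h>

    struct foo *foo_ptr;                /* RCU-protected pointer */

    /* Low priority reader, pinned to the only CPU core. */
    static void
    reader(void)
    {
            struct foo *p;

            rcu_read_lock();            /* linux_rcu_read_lock() */
            p = rcu_dereference(foo_ptr);
            use_foo(p);
            rcu_read_unlock();          /* linux_rcu_read_unlock() */
    }

    /* Higher priority updater running on the same CPU core. */
    static void
    updater(struct foo *newp)
    {
            struct foo *oldp = foo_ptr;

            rcu_assign_pointer(foo_ptr, newp);
            /*
             * Before this change synchronize_rcu() simply spun
             * waiting for the reader to finish. With both threads
             * stuck on one CPU core the lower priority reader
             * never ran, and this call never returned.
             */
            synchronize_rcu();          /* linux_synchronize_rcu() */
            free_foo(oldp);
    }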
  
  When a CK blocking condition is detected, figure out the lowest
  priority among the blocking threads, lower the calling thread's
  priority to match, and yield. If the read lock is held on another CPU
  core, bind the calling thread to that CPU core and raise its priority
  so that the blocking code gets to run. The calling thread's priority
  and CPU bindings are restored before returning.
  
  If a thread holding a CK read lock is detected to be sleeping, pause()
  will be used instead of yield().
  
  MFC after:		1 week
  Sponsored by:		Mellanox Technologies

Modified:
  head/sys/compat/linuxkpi/common/include/linux/sched.h
  head/sys/compat/linuxkpi/common/include/linux/srcu.h
  head/sys/compat/linuxkpi/common/src/linux_rcu.c

Modified: head/sys/compat/linuxkpi/common/include/linux/sched.h
==============================================================================
--- head/sys/compat/linuxkpi/common/include/linux/sched.h	Wed Apr 19 12:39:45 2017	(r317136)
+++ head/sys/compat/linuxkpi/common/include/linux/sched.h	Wed Apr 19 13:03:29 2017	(r317137)
@@ -37,7 +37,7 @@
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
 
-#include <linux/types.h>
+#include <linux/list.h>
 #include <linux/compat.h>
 #include <linux/completion.h>
 #include <linux/pid.h>
@@ -72,6 +72,8 @@ struct task_struct {
 	unsigned bsd_ioctl_len;
 	struct completion parked;
 	struct completion exited;
+	TAILQ_ENTRY(task_struct) rcu_entry;
+	int rcu_recurse;
 };
 
 #define	current	({ \

Modified: head/sys/compat/linuxkpi/common/include/linux/srcu.h
==============================================================================
--- head/sys/compat/linuxkpi/common/include/linux/srcu.h	Wed Apr 19 12:39:45 2017	(r317136)
+++ head/sys/compat/linuxkpi/common/include/linux/srcu.h	Wed Apr 19 13:03:29 2017	(r317137)
@@ -29,9 +29,7 @@
 #ifndef	_LINUX_SRCU_H_
 #define	_LINUX_SRCU_H_
 
-struct srcu_epoch_record;
 struct srcu_struct {
-	struct srcu_epoch_record *ss_epoch_record;
 };
 
 #define	srcu_dereference(ptr,srcu)	((__typeof(*(ptr)) *)(ptr))

Modified: head/sys/compat/linuxkpi/common/src/linux_rcu.c
==============================================================================
--- head/sys/compat/linuxkpi/common/src/linux_rcu.c	Wed Apr 19 12:39:45 2017	(r317136)
+++ head/sys/compat/linuxkpi/common/src/linux_rcu.c	Wed Apr 19 13:03:29 2017	(r317137)
@@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/smp.h>
 #include <sys/queue.h>
 #include <sys/taskqueue.h>
+#include <sys/kdb.h>
 
 #include <ck_epoch.h>
 
@@ -45,26 +46,34 @@ __FBSDID("$FreeBSD$");
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
+#include <linux/compat.h>
 
-struct callback_head;
-struct writer_epoch_record {
-	ck_epoch_record_t epoch_record;
-	struct mtx head_lock;
-	struct mtx sync_lock;
-	struct task task;
-	STAILQ_HEAD(, callback_head) head;
-} __aligned(CACHE_LINE_SIZE);
+/*
+ * By defining CONFIG_NO_RCU_SKIP LinuxKPI RCU locks and asserts will
+ * not be skipped during panic().
+ */
+#ifdef CONFIG_NO_RCU_SKIP
+#define	RCU_SKIP(void) 0
+#else
+#define	RCU_SKIP(void)	unlikely(SCHEDULER_STOPPED() || kdb_active)
+#endif
 
 struct callback_head {
 	STAILQ_ENTRY(callback_head) entry;
 	rcu_callback_t func;
 };
 
-struct srcu_epoch_record {
+struct linux_epoch_head {
+	STAILQ_HEAD(, callback_head) cb_head;
+	struct mtx lock;
+	struct task task;
+} __aligned(CACHE_LINE_SIZE);
+
+struct linux_epoch_record {
 	ck_epoch_record_t epoch_record;
-	struct mtx read_lock;
-	struct mtx sync_lock;
-};
+	TAILQ_HEAD(, task_struct) ts_head;
+	int cpuid;
+} __aligned(CACHE_LINE_SIZE);
 
 /*
  * Verify that "struct rcu_head" is big enough to hold "struct
@@ -72,57 +81,42 @@ struct srcu_epoch_record {
  * compile flags for including ck_epoch.h to all clients of the
  * LinuxKPI.
  */
-CTASSERT(sizeof(struct rcu_head) >= sizeof(struct callback_head));
+CTASSERT(sizeof(struct rcu_head) == sizeof(struct callback_head));
 
 /*
  * Verify that "epoch_record" is at beginning of "struct
- * writer_epoch_record":
+ * linux_epoch_record":
  */
-CTASSERT(offsetof(struct writer_epoch_record, epoch_record) == 0);
-
-/*
- * Verify that "epoch_record" is at beginning of "struct
- * srcu_epoch_record":
- */
-CTASSERT(offsetof(struct srcu_epoch_record, epoch_record) == 0);
+CTASSERT(offsetof(struct linux_epoch_record, epoch_record) == 0);
 
 static ck_epoch_t linux_epoch;
-static MALLOC_DEFINE(M_LRCU, "lrcu", "Linux RCU");
-static DPCPU_DEFINE(ck_epoch_record_t *, linux_reader_epoch_record);
-static DPCPU_DEFINE(struct writer_epoch_record *, linux_writer_epoch_record);
+static struct linux_epoch_head linux_epoch_head;
+static DPCPU_DEFINE(struct linux_epoch_record, linux_epoch_record);
 
 static void linux_rcu_cleaner_func(void *, int);
 
 static void
 linux_rcu_runtime_init(void *arg __unused)
 {
+	struct linux_epoch_head *head;
 	int i;
 
 	ck_epoch_init(&linux_epoch);
 
-	/* setup reader records */
-	CPU_FOREACH(i) {
-		ck_epoch_record_t *record;
-
-		record = malloc(sizeof(*record), M_LRCU, M_WAITOK | M_ZERO);
-		ck_epoch_register(&linux_epoch, record, NULL);
+	head = &linux_epoch_head;
 
-		DPCPU_ID_SET(i, linux_reader_epoch_record, record);
-	}
+	mtx_init(&head->lock, "LRCU-HEAD", NULL, MTX_DEF);
+	TASK_INIT(&head->task, 0, linux_rcu_cleaner_func, NULL);
+	STAILQ_INIT(&head->cb_head);
 
-	/* setup writer records */
 	CPU_FOREACH(i) {
-		struct writer_epoch_record *record;
+		struct linux_epoch_record *record;
 
-		record = malloc(sizeof(*record), M_LRCU, M_WAITOK | M_ZERO);
+		record = &DPCPU_ID_GET(i, linux_epoch_record);
 
+		record->cpuid = i;
 		ck_epoch_register(&linux_epoch, &record->epoch_record, NULL);
-		mtx_init(&record->head_lock, "LRCU-HEAD", NULL, MTX_DEF);
-		mtx_init(&record->sync_lock, "LRCU-SYNC", NULL, MTX_DEF);
-		TASK_INIT(&record->task, 0, linux_rcu_cleaner_func, record);
-		STAILQ_INIT(&record->head);
-
-		DPCPU_ID_SET(i, linux_writer_epoch_record, record);
+		TAILQ_INIT(&record->ts_head);
 	}
 }
 SYSINIT(linux_rcu_runtime, SI_SUB_LOCK, SI_ORDER_SECOND, linux_rcu_runtime_init, NULL);
@@ -130,91 +124,40 @@ SYSINIT(linux_rcu_runtime, SI_SUB_LOCK, 
 static void
 linux_rcu_runtime_uninit(void *arg __unused)
 {
-	ck_stack_entry_t *cursor;
-	ck_stack_entry_t *next;
-	int i;
+	struct linux_epoch_head *head;
 
-	/* make sure all callbacks have been called */
-	linux_rcu_barrier();
+	head = &linux_epoch_head;
 
-	/* destroy all writer record mutexes */
-	CPU_FOREACH(i) {
-		struct writer_epoch_record *record;
-
-		record = DPCPU_ID_GET(i, linux_writer_epoch_record);
-
-		mtx_destroy(&record->head_lock);
-		mtx_destroy(&record->sync_lock);
-	}
-
-	/* free all registered reader and writer records */
-	CK_STACK_FOREACH_SAFE(&linux_epoch.records, cursor, next) {
-		ck_epoch_record_t *record;
-
-		record = container_of(cursor,
-		    struct ck_epoch_record, record_next);
-		free(record, M_LRCU);
-	}
+	/* destroy head lock */
+	mtx_destroy(&head->lock);
 }
 SYSUNINIT(linux_rcu_runtime, SI_SUB_LOCK, SI_ORDER_SECOND, linux_rcu_runtime_uninit, NULL);
 
-static inline struct srcu_epoch_record *
-linux_srcu_get_record(void)
-{
-	struct srcu_epoch_record *record;
-
-	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
-	    "linux_srcu_get_record() might sleep");
-
-	/*
-	 * NOTE: The only records that are unregistered and can be
-	 * recycled are srcu_epoch_records.
-	 */
-	record = (struct srcu_epoch_record *)ck_epoch_recycle(&linux_epoch, NULL);
-	if (__predict_true(record != NULL))
-		return (record);
-
-	record = malloc(sizeof(*record), M_LRCU, M_WAITOK | M_ZERO);
-	mtx_init(&record->read_lock, "SRCU-READ", NULL, MTX_DEF | MTX_NOWITNESS);
-	mtx_init(&record->sync_lock, "SRCU-SYNC", NULL, MTX_DEF | MTX_NOWITNESS);
-	ck_epoch_register(&linux_epoch, &record->epoch_record, NULL);
-
-	return (record);
-}
-
-static inline void
-linux_rcu_synchronize_sub(struct writer_epoch_record *record)
-{
-
-	/* protect access to epoch_record */
-	mtx_lock(&record->sync_lock);
-	ck_epoch_synchronize(&record->epoch_record);
-	mtx_unlock(&record->sync_lock);
-}
-
 static void
-linux_rcu_cleaner_func(void *context, int pending __unused)
+linux_rcu_cleaner_func(void *context __unused, int pending __unused)
 {
-	struct writer_epoch_record *record;
+	struct linux_epoch_head *head;
 	struct callback_head *rcu;
-	STAILQ_HEAD(, callback_head) head;
+	STAILQ_HEAD(, callback_head) tmp_head;
+
+	linux_set_current(curthread);
 
-	record = context;
+	head = &linux_epoch_head;
 
 	/* move current callbacks into own queue */
-	mtx_lock(&record->head_lock);
-	STAILQ_INIT(&head);
-	STAILQ_CONCAT(&head, &record->head);
-	mtx_unlock(&record->head_lock);
+	mtx_lock(&head->lock);
+	STAILQ_INIT(&tmp_head);
+	STAILQ_CONCAT(&tmp_head, &head->cb_head);
+	mtx_unlock(&head->lock);
 
 	/* synchronize */
-	linux_rcu_synchronize_sub(record);
+	linux_synchronize_rcu();
 
 	/* dispatch all callbacks, if any */
-	while ((rcu = STAILQ_FIRST(&head)) != NULL) {
+	while ((rcu = STAILQ_FIRST(&tmp_head)) != NULL) {
 		uintptr_t offset;
 
-		STAILQ_REMOVE_HEAD(&head, entry);
+		STAILQ_REMOVE_HEAD(&tmp_head, entry);
 
 		offset = (uintptr_t)rcu->func;
 
@@ -228,148 +171,217 @@ linux_rcu_cleaner_func(void *context, in
 void
 linux_rcu_read_lock(void)
 {
-	ck_epoch_record_t *record;
+	struct linux_epoch_record *record;
+	struct task_struct *ts;
+
+	if (RCU_SKIP())
+		return;
 
 	/*
 	 * Pin thread to current CPU so that the unlock code gets the
-	 * same per-CPU reader epoch record:
+	 * same per-CPU epoch record:
 	 */
 	sched_pin();
 
-	record = DPCPU_GET(linux_reader_epoch_record);
+	record = &DPCPU_GET(linux_epoch_record);
+	ts = current;
 
 	/*
 	 * Use a critical section to prevent recursion inside
 	 * ck_epoch_begin(). Else this function supports recursion.
 	 */
 	critical_enter();
-	ck_epoch_begin(record, NULL);
+	ck_epoch_begin(&record->epoch_record, NULL);
+	ts->rcu_recurse++;
+	if (ts->rcu_recurse == 1)
+		TAILQ_INSERT_TAIL(&record->ts_head, ts, rcu_entry);
 	critical_exit();
 }
 
 void
 linux_rcu_read_unlock(void)
 {
-	ck_epoch_record_t *record;
+	struct linux_epoch_record *record;
+	struct task_struct *ts;
+
+	if (RCU_SKIP())
+		return;
 
-	record = DPCPU_GET(linux_reader_epoch_record);
+	record = &DPCPU_GET(linux_epoch_record);
+	ts = current;
 
 	/*
 	 * Use a critical section to prevent recursion inside
 	 * ck_epoch_end(). Else this function supports recursion.
 	 */
 	critical_enter();
-	ck_epoch_end(record, NULL);
+	ck_epoch_end(&record->epoch_record, NULL);
+	ts->rcu_recurse--;
+	if (ts->rcu_recurse == 0)
+		TAILQ_REMOVE(&record->ts_head, ts, rcu_entry);
 	critical_exit();
 
 	sched_unpin();
 }
 
+static void
+linux_synchronize_rcu_cb(ck_epoch_t *epoch __unused, ck_epoch_record_t *epoch_record, void *arg __unused)
+{
+	struct linux_epoch_record *record =
+	    container_of(epoch_record, struct linux_epoch_record, epoch_record);
+	struct thread *td = curthread;
+	struct task_struct *ts;
+
+	/* check if blocked on the current CPU */
+	if (record->cpuid == PCPU_GET(cpuid)) {
+		bool is_sleeping = 0;
+		u_char prio = 0;
+		u_char old_prio;
+
+		/*
+		 * Find the lowest priority or sleeping thread which
+		 * is blocking synchronization on this CPU core. All
+		 * the threads in the queue are CPU-pinned and cannot
+		 * go anywhere while the current thread is locked.
+		 */
+		TAILQ_FOREACH(ts, &record->ts_head, rcu_entry) {
+			if (ts->task_thread->td_priority > prio)
+				prio = ts->task_thread->td_priority;
+			is_sleeping |= (ts->task_thread->td_inhibitors != 0);
+		}
+
+		if (is_sleeping) {
+			thread_unlock(td);
+			pause("W", 1);
+			thread_lock(td);
+		} else {
+			old_prio = td->td_priority;
+			/* set new thread priority */
+			sched_prio(td, prio);
+			/* task switch */
+			mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
+			/* restore thread priority */
+			sched_prio(td, old_prio);
+		}
+	} else {
+		/*
+		 * To avoid spinning move execution to the other CPU
+		 * which is blocking synchronization. Set highest
+		 * thread priority so that code gets run. The thread
+		 * priority will be restored later.
+		 */
+		sched_prio(td, 0);
+		sched_bind(td, record->cpuid);
+	}
+}
+
 void
 linux_synchronize_rcu(void)
 {
-	linux_rcu_synchronize_sub(DPCPU_GET(linux_writer_epoch_record));
+	struct thread *td;
+	int was_bound;
+	int old_cpu;
+	int old_pinned;
+
+	if (RCU_SKIP())
+		return;
+
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+	    "linux_synchronize_rcu() can sleep");
+
+	td = curthread;
+
+	DROP_GIANT();
+
+	/*
+	 * Synchronizing RCU might change the CPU core this function
+	 * is running on. Save current values:
+	 */
+	thread_lock(td);
+
+	old_cpu = PCPU_GET(cpuid);
+	old_pinned = td->td_pinned;
+	td->td_pinned = 0;
+	was_bound = sched_is_bound(td);
+	sched_bind(td, old_cpu);
+
+	ck_epoch_synchronize_wait(&linux_epoch,
+	    &linux_synchronize_rcu_cb, NULL);
+
+	/* restore CPU binding, if any */
+	if (was_bound != 0) {
+		sched_bind(td, old_cpu);
+	} else {
+		/* get thread back to initial CPU, if any */
+		if (old_pinned != 0)
+			sched_bind(td, old_cpu);
+		sched_unbind(td);
+	}
+	/* restore pinned after bind */
+	td->td_pinned = old_pinned;
+	thread_unlock(td);
+
+	PICKUP_GIANT();
 }
 
 void
 linux_rcu_barrier(void)
 {
-	int i;
-
-	CPU_FOREACH(i) {
-		struct writer_epoch_record *record;
+	struct linux_epoch_head *head;
 
-		record = DPCPU_ID_GET(i, linux_writer_epoch_record);
+	linux_synchronize_rcu();
 
-		linux_rcu_synchronize_sub(record);
+	head = &linux_epoch_head;
 
-		/* wait for callbacks to complete */
-		taskqueue_drain(taskqueue_fast, &record->task);
-	}
+	/* wait for callbacks to complete */
+	taskqueue_drain(taskqueue_fast, &head->task);
 }
 
 void
 linux_call_rcu(struct rcu_head *context, rcu_callback_t func)
 {
 	struct callback_head *rcu = (struct callback_head *)context;
-	struct writer_epoch_record *record;
-
-	record = DPCPU_GET(linux_writer_epoch_record);
+	struct linux_epoch_head *head = &linux_epoch_head;
 
-	mtx_lock(&record->head_lock);
+	mtx_lock(&head->lock);
 	rcu->func = func;
-	STAILQ_INSERT_TAIL(&record->head, rcu, entry);
-	taskqueue_enqueue(taskqueue_fast, &record->task);
-	mtx_unlock(&record->head_lock);
+	STAILQ_INSERT_TAIL(&head->cb_head, rcu, entry);
+	taskqueue_enqueue(taskqueue_fast, &head->task);
+	mtx_unlock(&head->lock);
 }
 
 int
 init_srcu_struct(struct srcu_struct *srcu)
 {
-	struct srcu_epoch_record *record;
-
-	record = linux_srcu_get_record();
-	srcu->ss_epoch_record = record;
 	return (0);
 }
 
 void
 cleanup_srcu_struct(struct srcu_struct *srcu)
 {
-	struct srcu_epoch_record *record;
-
-	record = srcu->ss_epoch_record;
-	srcu->ss_epoch_record = NULL;
-
-	ck_epoch_unregister(&record->epoch_record);
 }
 
 int
 srcu_read_lock(struct srcu_struct *srcu)
 {
-	struct srcu_epoch_record *record;
-
-	record = srcu->ss_epoch_record;
-
-	mtx_lock(&record->read_lock);
-	ck_epoch_begin(&record->epoch_record, NULL);
-	mtx_unlock(&record->read_lock);
-
+	linux_rcu_read_lock();
 	return (0);
 }
 
 void
 srcu_read_unlock(struct srcu_struct *srcu, int key __unused)
 {
-	struct srcu_epoch_record *record;
-
-	record = srcu->ss_epoch_record;
-
-	mtx_lock(&record->read_lock);
-	ck_epoch_end(&record->epoch_record, NULL);
-	mtx_unlock(&record->read_lock);
+	linux_rcu_read_unlock();
 }
 
 void
 synchronize_srcu(struct srcu_struct *srcu)
 {
-	struct srcu_epoch_record *record;
-
-	record = srcu->ss_epoch_record;
-
-	mtx_lock(&record->sync_lock);
-	ck_epoch_synchronize(&record->epoch_record);
-	mtx_unlock(&record->sync_lock);
+	linux_synchronize_rcu();
 }
 
 void
 srcu_barrier(struct srcu_struct *srcu)
 {
-	struct srcu_epoch_record *record;
-
-	record = srcu->ss_epoch_record;
-
-	mtx_lock(&record->sync_lock);
-	ck_epoch_barrier(&record->epoch_record);
-	mtx_unlock(&record->sync_lock);
+	linux_rcu_barrier();
 }
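
A practical consequence of the srcu.h and linux_rcu.c hunks above is
that struct srcu_struct no longer carries any per-instance state and
every SRCU entry point is now a thin wrapper around the plain LinuxKPI
RCU path. A usage sketch, hypothetical consumer code for illustration
only:

    struct srcu_struct ss;
    int idx;

    init_srcu_struct(&ss);          /* no per-instance setup, returns 0 */
    idx = srcu_read_lock(&ss);      /* -> linux_rcu_read_lock() */
    /* ... read-side critical section ... */
    srcu_read_unlock(&ss, idx);     /* -> linux_rcu_read_unlock() */
    synchronize_srcu(&ss);          /* -> linux_synchronize_rcu() */
    srcu_barrier(&ss);              /* -> linux_rcu_barrier() */
    cleanup_srcu_struct(&ss);       /* no per-instance teardown */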

