git: ec25b6fa5ffd - main - LinuxKPI: Reimplement irq_work queue on top of fast taskqueue

Emmanuel Vadot manu at FreeBSD.org
Sun Jan 17 11:47:44 UTC 2021


The branch main has been updated by manu:

URL: https://cgit.FreeBSD.org/src/commit/?id=ec25b6fa5ffde89f202c2caa77268ed1eed12158

commit ec25b6fa5ffde89f202c2caa77268ed1eed12158
Author:     Vladimir Kondratyev <wulf at FreeBSD.org>
AuthorDate: 2021-01-17 11:21:49 +0000
Commit:     Emmanuel Vadot <manu at FreeBSD.org>
CommitDate: 2021-01-17 11:47:28 +0000

    LinuxKPI: Reimplement irq_work queue on top of fast taskqueue
    
    Summary:
    Linux's irq_work queue was created for asynchronous execution of code from contexts where spinlocks are not available, such as hardware interrupt context. FreeBSD's fast taskqueues were created for the same purpose.
    
    Drm-kmod 5.4 uses irq_work_queue() in at least one place to schedule execution of a task/work item from inside a critical section, which triggers the following INVARIANTS-induced panic:
    
    ```
    panic: acquiring blockable sleep lock with spinlock or critical section held (sleep mutex) linuxkpi_short_wq @ /usr/src/sys/kern/subr_taskqueue.c:281
    cpuid = 6
    time = 1605048416
    KDB: stack backtrace:
    db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfffffe006b538c90
    vpanic() at vpanic+0x182/frame 0xfffffe006b538ce0
    panic() at panic+0x43/frame 0xfffffe006b538d40
    witness_checkorder() at witness_checkorder+0xf3e/frame 0xfffffe006b538f00
    __mtx_lock_flags() at __mtx_lock_flags+0x94/frame 0xfffffe006b538f50
    taskqueue_enqueue() at taskqueue_enqueue+0x42/frame 0xfffffe006b538f70
    linux_queue_work_on() at linux_queue_work_on+0xe9/frame 0xfffffe006b538fb0
    irq_work_queue() at irq_work_queue+0x21/frame 0xfffffe006b538fd0
    semaphore_notify() at semaphore_notify+0xb2/frame 0xfffffe006b539020
    __i915_sw_fence_notify() at __i915_sw_fence_notify+0x2e/frame 0xfffffe006b539050
    __i915_sw_fence_complete() at __i915_sw_fence_complete+0x63/frame 0xfffffe006b539080
    i915_sw_fence_complete() at i915_sw_fence_complete+0x8e/frame 0xfffffe006b5390c0
    dma_i915_sw_fence_wake() at dma_i915_sw_fence_wake+0x4f/frame 0xfffffe006b539100
    dma_fence_signal_locked() at dma_fence_signal_locked+0x105/frame 0xfffffe006b539180
    dma_fence_signal() at dma_fence_signal+0x72/frame 0xfffffe006b5391c0
    dma_fence_is_signaled() at dma_fence_is_signaled+0x80/frame 0xfffffe006b539200
    dma_resv_add_shared_fence() at dma_resv_add_shared_fence+0xb3/frame 0xfffffe006b539270
    i915_vma_move_to_active() at i915_vma_move_to_active+0x18a/frame 0xfffffe006b5392b0
    eb_move_to_gpu() at eb_move_to_gpu+0x3ad/frame 0xfffffe006b539320
    eb_submit() at eb_submit+0x15/frame 0xfffffe006b539350
    i915_gem_do_execbuffer() at i915_gem_do_execbuffer+0x7d4/frame 0xfffffe006b539570
    i915_gem_execbuffer2_ioctl() at i915_gem_execbuffer2_ioctl+0x1c1/frame 0xfffffe006b539600
    drm_ioctl_kernel() at drm_ioctl_kernel+0xd9/frame 0xfffffe006b539670
    drm_ioctl() at drm_ioctl+0x5cd/frame 0xfffffe006b539820
    linux_file_ioctl() at linux_file_ioctl+0x323/frame 0xfffffe006b539880
    kern_ioctl() at kern_ioctl+0x1f4/frame 0xfffffe006b5398f0
    sys_ioctl() at sys_ioctl+0x12a/frame 0xfffffe006b5399c0
    amd64_syscall() at amd64_syscall+0x121/frame 0xfffffe006b539af0
    fast_syscall_common() at fast_syscall_common+0xf8/frame 0xfffffe006b539af0
    --- syscall (54, FreeBSD ELF64, sys_ioctl), rip = 0x800a6f09a, rsp = 0x7fffffffe588, rbp = 0x7fffffffe640 ---
    KDB: enter: panic
    ```
    Here, dma_resv_add_shared_fence() performs a critical_enter(), and the subsequent call to schedule_work() from semaphore_notify() triggers the 'acquiring blockable sleep lock with spinlock or critical section held' panic.
    
    Switching the irq_work implementation to a fast taskqueue fixes the panic for me.
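    
    For illustration, the deferral pattern involved looks like this (a minimal sketch, not taken from drm-kmod; the callback and item names are hypothetical): the work item is initialized once, and irq_work_queue() may then be called from a critical section, with the callback running later in the irq_work taskqueue thread.
    
    ```
    #include <sys/param.h>
    #include <sys/systm.h>	/* critical_enter()/critical_exit() */
    
    #include <linux/irq_work.h>
    
    static struct irq_work example_work;
    
    static void
    example_cb(struct irq_work *work)
    {
    	/* Runs later in the linuxkpi_irq_wq thread, where
    	 * sleepable locks may be acquired safely. */
    }
    
    static void
    example_trigger(void)
    {
    	init_irq_work(&example_work, example_cb);
    
    	critical_enter();
    	/* Enqueues on the fast taskqueue; no blockable sleep
    	 * lock is taken on this path anymore. */
    	irq_work_queue(&example_work);
    	critical_exit();
    }
    ```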
    
    Another report of a similar bug: https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=247166
    
    Reviewed By: hselasky
    Differential Revision: https://reviews.freebsd.org/D27171
---
 .../linuxkpi/common/include/linux/irq_work.h       |  25 ++++-
 sys/compat/linuxkpi/common/include/linux/llist.h   | 101 +++++++++++++++++++++
 sys/compat/linuxkpi/common/include/linux/slab.h    |  16 ++++
 sys/compat/linuxkpi/common/src/linux_slab.c        |  27 ++++++
 sys/compat/linuxkpi/common/src/linux_work.c        |  48 ++++++++++
 5 files changed, 212 insertions(+), 5 deletions(-)

diff --git a/sys/compat/linuxkpi/common/include/linux/irq_work.h b/sys/compat/linuxkpi/common/include/linux/irq_work.h
index b44e78230b0d..eb1798a4e450 100644
--- a/sys/compat/linuxkpi/common/include/linux/irq_work.h
+++ b/sys/compat/linuxkpi/common/include/linux/irq_work.h
@@ -31,22 +31,37 @@
 #ifndef __LINUX_IRQ_WORK_H__
 #define	__LINUX_IRQ_WORK_H__
 
-#include <linux/workqueue.h>
+#include <sys/param.h>
+#include <sys/taskqueue.h>
+
+struct irq_work;
+typedef void (*irq_work_func_t)(struct irq_work *);
 
 struct irq_work {
-	struct work_struct work;
+	struct task irq_task;
+	irq_work_func_t func;
 };
 
+extern struct taskqueue *linux_irq_work_tq;
+
+#define	DEFINE_IRQ_WORK(name, _func)	struct irq_work name = {	\
+	.irq_task = TASK_INITIALIZER(0, linux_irq_work_fn, &(name)),	\
+	.func  = (_func),						\
+}
+
+void	linux_irq_work_fn(void *, int);
+
 static inline void
-init_irq_work(struct irq_work *irqw, void (*func)(struct irq_work *))
+init_irq_work(struct irq_work *irqw, irq_work_func_t func)
 {
-	INIT_WORK(&irqw->work, (work_func_t)func);
+	TASK_INIT(&irqw->irq_task, 0, linux_irq_work_fn, irqw);
+	irqw->func = func;
 }
 
 static inline void
 irq_work_queue(struct irq_work *irqw)
 {
-	schedule_work(&irqw->work);
+	taskqueue_enqueue(linux_irq_work_tq, &irqw->irq_task);
 }
 
 #endif /* __LINUX_IRQ_WORK_H__ */
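
With the reimplemented header, a consumer can still declare an irq_work item statically via DEFINE_IRQ_WORK() or initialize it at run time with init_irq_work(); either way the item is now backed by a struct task enqueued on linux_irq_work_tq. A minimal sketch of both forms (function and variable names are illustrative, not part of the commit):

```
#include <linux/irq_work.h>

static void
flush_cb(struct irq_work *work)
{
	/* Executed from the linuxkpi_irq_wq taskqueue thread. */
}

/* Static initialization via the new macro... */
static DEFINE_IRQ_WORK(static_flush_work, flush_cb);

/* ...or run-time initialization of an embedded item. */
struct my_softc {
	struct irq_work	flush_work;
};

static void
my_attach(struct my_softc *sc)
{
	init_irq_work(&sc->flush_work, flush_cb);
}

static void
my_interrupt(struct my_softc *sc)
{
	/* May be called with a spinlock held or in a critical section. */
	irq_work_queue(&sc->flush_work);
}
```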
diff --git a/sys/compat/linuxkpi/common/include/linux/llist.h b/sys/compat/linuxkpi/common/include/linux/llist.h
new file mode 100644
index 000000000000..b3c89516e710
--- /dev/null
+++ b/sys/compat/linuxkpi/common/include/linux/llist.h
@@ -0,0 +1,101 @@
+/* Public domain. */
+
+#ifndef _LINUX_LLIST_H
+#define _LINUX_LLIST_H
+
+#include <sys/types.h>
+#include <machine/atomic.h>
+
+struct llist_node {
+	struct llist_node *next;
+};
+
+struct llist_head {
+	struct llist_node *first;
+};
+
+#define	LLIST_HEAD_INIT(name)	{ NULL }
+#define	LLIST_HEAD(name)	struct llist_head name = LLIST_HEAD_INIT(name)
+
+#define llist_entry(ptr, type, member) \
+	((ptr) ? container_of(ptr, type, member) : NULL)
+
+static inline struct llist_node *
+llist_del_all(struct llist_head *head)
+{
+	return ((void *)atomic_readandclear_ptr((uintptr_t *)&head->first));
+}
+
+static inline struct llist_node *
+llist_del_first(struct llist_head *head)
+{
+	struct llist_node *first, *next;
+
+	do {
+		first = head->first;
+		if (first == NULL)
+			return NULL;
+		next = first->next;
+	} while (atomic_cmpset_ptr((uintptr_t *)&head->first,
+	    (uintptr_t)first, (uintptr_t)next) == 0);
+
+	return (first);
+}
+
+static inline bool
+llist_add(struct llist_node *new, struct llist_head *head)
+{
+	struct llist_node *first;
+
+	do {
+		new->next = first = head->first;
+	} while (atomic_cmpset_ptr((uintptr_t *)&head->first,
+	    (uintptr_t)first, (uintptr_t)new) == 0);
+
+	return (first == NULL);
+}
+
+static inline bool
+llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
+    struct llist_head *head)
+{
+	struct llist_node *first;
+
+	do {
+		new_last->next = first = head->first;
+	} while (atomic_cmpset_ptr((uintptr_t *)&head->first,
+	    (uintptr_t)first, (uintptr_t)new_first) == 0);
+
+	return (first == NULL);
+}
+
+static inline void
+init_llist_head(struct llist_head *head)
+{
+	head->first = NULL;
+}
+
+static inline bool
+llist_empty(struct llist_head *head)
+{
+	return (head->first == NULL);
+}
+
+#define llist_for_each_safe(pos, n, node)				\
+	for ((pos) = (node);						\
+	    (pos) != NULL &&						\
+	    ((n) = (pos)->next, pos);					\
+	    (pos) = (n))
+
+#define llist_for_each_entry_safe(pos, n, node, member) 		\
+	for (pos = llist_entry((node), __typeof(*pos), member); 	\
+	    pos != NULL &&						\
+	    (n = llist_entry(pos->member.next, __typeof(*pos), member), pos); \
+	    pos = n)
+
+#define llist_for_each_entry(pos, node, member)				\
+	for ((pos) = llist_entry((node), __typeof(*(pos)), member);	\
+	    (pos) != NULL;						\
+	    (pos) = llist_entry((pos)->member.next, __typeof(*(pos)), member))
+
+#endif
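
The new llist.h provides a minimal lock-free singly linked list: producers push nodes with a compare-and-swap on the head, and a consumer detaches the whole chain atomically with llist_del_all(). A hedged sketch of that producer/consumer pattern (type and function names are illustrative only):

```
#include <linux/kernel.h>	/* container_of(), used by llist_entry() */
#include <linux/llist.h>
#include <linux/slab.h>

struct pending {
	struct llist_node	node;
	int			value;
};

static LLIST_HEAD(pending_list);

/* Producer side: safe from critical sections, only a CAS on the head. */
static void
pending_push(struct pending *p)
{
	llist_add(&p->node, &pending_list);
}

/* Consumer side: detach the whole list atomically, then walk it. */
static void
pending_drain(void)
{
	struct llist_node *batch;
	struct pending *pos, *n;

	batch = llist_del_all(&pending_list);
	llist_for_each_entry_safe(pos, n, batch, node) {
		/* ...process pos->value, then release the node... */
		kfree(pos);
	}
}
```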
diff --git a/sys/compat/linuxkpi/common/include/linux/slab.h b/sys/compat/linuxkpi/common/include/linux/slab.h
index ae1c9d81843e..0cd748b7ecb9 100644
--- a/sys/compat/linuxkpi/common/include/linux/slab.h
+++ b/sys/compat/linuxkpi/common/include/linux/slab.h
@@ -35,10 +35,12 @@
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/limits.h>
+#include <sys/proc.h>
 #include <vm/uma.h>
 
 #include <linux/types.h>
 #include <linux/gfp.h>
+#include <linux/llist.h>
 
 MALLOC_DECLARE(M_KMALLOC);
 
@@ -90,6 +92,19 @@ struct linux_kmem_cache {
 #define	ARCH_KMALLOC_MINALIGN \
 	__alignof(unsigned long long)
 
+/*
+ * Critical section-friendly version of kfree().
+ * Requires knowledge of the allocation size at build time.
+ */
+#define kfree_async(ptr)	do {					\
+	_Static_assert(sizeof(*(ptr)) >= sizeof(struct llist_node),	\
+	    "Size of object to free is unknown or too small");		\
+	if (curthread->td_critnest != 0)				\
+		linux_kfree_async(ptr);					\
+	else								\
+		kfree(ptr);						\
+} while (0)
+
 static inline gfp_t
 linux_check_m_flags(gfp_t flags)
 {
@@ -189,5 +204,6 @@ linux_kmem_cache_free(struct linux_kmem_cache *c, void *m)
 }
 
 extern void linux_kmem_cache_destroy(struct linux_kmem_cache *);
+void linux_kfree_async(void *);
 
 #endif					/* _LINUX_SLAB_H_ */
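
kfree_async() is the consumer-facing piece: when curthread->td_critnest is non-zero the pointer is handed to linux_kfree_async() for deferred freeing, otherwise kfree() runs immediately. Because the freed object's own memory is reused as the llist_node, the static assert requires the object to be at least pointer-sized. A minimal usage sketch (the structure and function are hypothetical):

```
#include <linux/slab.h>

struct fence_note {
	void	*owner;		/* keeps sizeof() >= sizeof(struct llist_node) */
	int	 cookie;
};

static void
fence_note_release(struct fence_note *note)
{
	/*
	 * The caller may be inside a critical section (e.g. a fence
	 * signaling callback); kfree_async() takes the deferred path
	 * in that case and frees directly otherwise.
	 */
	kfree_async(note);
}
```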
diff --git a/sys/compat/linuxkpi/common/src/linux_slab.c b/sys/compat/linuxkpi/common/src/linux_slab.c
index c8deab01731a..3304c34b1dee 100644
--- a/sys/compat/linuxkpi/common/src/linux_slab.c
+++ b/sys/compat/linuxkpi/common/src/linux_slab.c
@@ -30,6 +30,11 @@ __FBSDID("$FreeBSD$");
 #include <linux/slab.h>
 #include <linux/rcupdate.h>
 #include <linux/kernel.h>
+#include <linux/irq_work.h>
+#include <linux/llist.h>
+
+#include <sys/param.h>
+#include <sys/taskqueue.h>
 
 struct linux_kmem_rcu {
 	struct rcu_head rcu_head;
@@ -44,6 +49,8 @@ struct linux_kmem_rcu {
 	((void *)((char *)(r) + sizeof(struct linux_kmem_rcu) - \
 	(r)->cache->cache_size))
 
+static LLIST_HEAD(linux_kfree_async_list);
+
 static int
 linux_kmem_ctor(void *mem, int size, void *arg, int flags)
 {
@@ -126,3 +133,23 @@ linux_kmem_cache_destroy(struct linux_kmem_cache *c)
 	uma_zdestroy(c->cache_zone);
 	free(c, M_KMALLOC);
 }
+
+static void
+linux_kfree_async_fn(void *context, int pending)
+{
+	struct llist_node *freed;
+
+	while((freed = llist_del_first(&linux_kfree_async_list)) != NULL)
+		kfree(freed);
+}
+static struct task linux_kfree_async_task =
+    TASK_INITIALIZER(0, linux_kfree_async_fn, &linux_kfree_async_task);
+
+void
+linux_kfree_async(void *addr)
+{
+	if (addr == NULL)
+		return;
+	llist_add(addr, &linux_kfree_async_list);
+	taskqueue_enqueue(linux_irq_work_tq, &linux_kfree_async_task);
+}
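
Since the asynchronous frees ride on the same linux_irq_work_tq, code that needs all pending kfree_async() calls to have completed (for example before tearing down a consumer in a test) can simply drain that taskqueue. A short sketch, assuming only the taskqueue(9) semantics already used in this commit:

```
#include <sys/param.h>
#include <sys/taskqueue.h>

extern struct taskqueue *linux_irq_work_tq;

static void
wait_for_deferred_frees(void)
{
	/* Blocks until every task queued so far, including the
	 * kfree_async drain task, has finished executing. */
	taskqueue_drain_all(linux_irq_work_tq);
}
```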
diff --git a/sys/compat/linuxkpi/common/src/linux_work.c b/sys/compat/linuxkpi/common/src/linux_work.c
index 043c5b7d1aff..45025378baa9 100644
--- a/sys/compat/linuxkpi/common/src/linux_work.c
+++ b/sys/compat/linuxkpi/common/src/linux_work.c
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
 #include <linux/compat.h>
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/irq_work.h>
 
 #include <sys/kernel.h>
 
@@ -59,6 +60,8 @@ struct workqueue_struct *system_unbound_wq;
 struct workqueue_struct *system_highpri_wq;
 struct workqueue_struct *system_power_efficient_wq;
 
+struct taskqueue *linux_irq_work_tq;
+
 static int linux_default_wq_cpus = 4;
 
 static void linux_delayed_work_timer_fn(void *);
@@ -683,3 +686,48 @@ linux_work_uninit(void *arg)
 	system_highpri_wq = NULL;
 }
 SYSUNINIT(linux_work_uninit, SI_SUB_TASKQ, SI_ORDER_THIRD, linux_work_uninit, NULL);
+
+void
+linux_irq_work_fn(void *context, int pending)
+{
+	struct irq_work *irqw = context;
+
+	irqw->func(irqw);
+}
+
+static void
+linux_irq_work_init_fn(void *context, int pending)
+{
+	/*
+	 * LinuxKPI performs lazy allocation of memory structures required by
+	 * current on the first access to it.  As some irq_work clients read
+	 * it with spinlock taken, we have to preallocate td_lkpi_task before
+	 * first call to irq_work_queue().  As irq_work uses a single thread,
+	 * it is enough to read current once at SYSINIT stage.
+	 */
+	if (current == NULL)
+		panic("irq_work taskqueue is not initialized");
+}
+static struct task linux_irq_work_init_task =
+    TASK_INITIALIZER(0, linux_irq_work_init_fn, &linux_irq_work_init_task);
+
+static void
+linux_irq_work_init(void *arg)
+{
+	linux_irq_work_tq = taskqueue_create_fast("linuxkpi_irq_wq",
+	    M_WAITOK, taskqueue_thread_enqueue, &linux_irq_work_tq);
+	taskqueue_start_threads(&linux_irq_work_tq, 1, PWAIT,
+	    "linuxkpi_irq_wq");
+	taskqueue_enqueue(linux_irq_work_tq, &linux_irq_work_init_task);
+}
+SYSINIT(linux_irq_work_init, SI_SUB_TASKQ, SI_ORDER_SECOND,
+    linux_irq_work_init, NULL);
+
+static void
+linux_irq_work_uninit(void *arg)
+{
+	taskqueue_drain_all(linux_irq_work_tq);
+	taskqueue_free(linux_irq_work_tq);
+}
+SYSUNINIT(linux_irq_work_uninit, SI_SUB_TASKQ, SI_ORDER_SECOND,
+    linux_irq_work_uninit, NULL);
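
For reference, the SYSINIT above follows the standard fast-taskqueue pattern from taskqueue(9); a self-contained sketch of the same pattern outside LinuxKPI looks like this (all names are illustrative):

```
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/priority.h>
#include <sys/taskqueue.h>

static struct taskqueue *example_tq;
static struct task example_task;

static void
example_task_fn(void *context, int pending)
{
	/* Runs in the taskqueue thread; sleeping is allowed here. */
}

static void
example_init(void *arg)
{
	TASK_INIT(&example_task, 0, example_task_fn, NULL);
	example_tq = taskqueue_create_fast("example_tq", M_WAITOK,
	    taskqueue_thread_enqueue, &example_tq);
	taskqueue_start_threads(&example_tq, 1, PWAIT, "example_tq");
}
SYSINIT(example_tq_init, SI_SUB_TASKQ, SI_ORDER_ANY, example_init, NULL);

/*
 * Enqueueing on a fast taskqueue only takes a spin mutex rather than
 * the sleep mutex seen in the panic above, so callers in interrupt
 * handlers or critical sections may use it:
 *
 *	taskqueue_enqueue(example_tq, &example_task);
 */
```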

