git: 42736dc44dd0 - main - x86/iommu: Reduce DMAR lock contention

From: Alan Cox <alc@FreeBSD.org>
Date: Fri, 29 Jul 2022 05:12:41 UTC
The branch main has been updated by alc:

URL: https://cgit.FreeBSD.org/src/commit/?id=42736dc44dd0151546db3f2e145ae1cfd4546fe1

commit 42736dc44dd0151546db3f2e145ae1cfd4546fe1
Author:     Alan Cox <alc@FreeBSD.org>
AuthorDate: 2022-07-26 06:04:54 +0000
Commit:     Alan Cox <alc@FreeBSD.org>
CommitDate: 2022-07-29 05:11:33 +0000

    x86/iommu: Reduce DMAR lock contention
    
    Replace the DMAR unit's tlb_flush TAILQ by a custom list implementation
    that enables dmar_qi_task() to dequeue entries without holding the DMAR
    lock.
    
    Reviewed by:    kib
    MFC after:      1 week
    Differential Revision:  https://reviews.freebsd.org/D35951
---
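Note (not part of the commit): the diff below replaces the DMAR unit's TAILQ of
pending TLB flushes with a linked list whose head always points at a "zombie"
entry, the one most recently dequeued.  That lets the single dequeuer,
dmar_qi_task(), drain the list without the DMAR lock, while enqueuers are still
serialized by that lock.  The following minimal user-space sketch shows the same
zombie-head technique; every name in it is illustrative, and C11 atomics stand
in for the kernel's atomic_store_rel_ptr()/atomic_load_acq_ptr().

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int payload;			/* stands in for the map entry's data */
	struct node *_Atomic next;	/* analogue of tlb_flush_next */
};

struct spsc_queue {
	struct node *head;	/* touched only by the single dequeuer */
	struct node *tail;	/* touched only by the (serialized) enqueuers */
};

static void
spsc_init(struct spsc_queue *q)
{
	/* The list always holds one "zombie" node, so head and tail are never NULL. */
	struct node *z = calloc(1, sizeof(*z));

	assert(z != NULL);
	q->head = q->tail = z;
}

/* Enqueuers are serialized externally (the DMAR lock in the driver). */
static void
spsc_enqueue(struct spsc_queue *q, struct node *n)
{
	atomic_store_explicit(&n->next, NULL, memory_order_relaxed);
	/* Release: the payload must be visible before the node is linked. */
	atomic_store_explicit(&q->tail->next, n, memory_order_release);
	q->tail = n;
}

/* Called only by the single dequeuer (dmar_qi_task() in the driver). */
static struct node *
spsc_dequeue(struct spsc_queue *q)
{
	struct node *zombie, *first;

	zombie = q->head;
	/* Acquire: pairs with the release store in spsc_enqueue(). */
	first = atomic_load_explicit(&zombie->next, memory_order_acquire);
	if (first == NULL)
		return (NULL);		/* empty */
	q->head = first;		/* "first" becomes the new zombie */
	free(zombie);			/* the old zombie is finally freed */
	return (first);			/* still owned by the queue as the zombie */
}

int
main(void)
{
	struct spsc_queue q;
	struct node *n;
	int i;

	spsc_init(&q);
	for (i = 0; i < 3; i++) {
		n = calloc(1, sizeof(*n));
		assert(n != NULL);
		n->payload = i;
		spsc_enqueue(&q, n);
	}
	while ((n = spsc_dequeue(&q)) != NULL)
		printf("dequeued %d\n", n->payload);
	free(q.head);			/* release the final zombie */
	return (0);
}
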
 sys/dev/iommu/iommu.h      |   5 +-
 sys/dev/iommu/iommu_gas.c  |   5 +-
 sys/x86/iommu/intel_ctx.c  |  16 +++----
 sys/x86/iommu/intel_dmar.h |  33 +++++++++++--
 sys/x86/iommu/intel_qi.c   | 113 ++++++++++++++++++++++++++++++++++++++-------
 5 files changed, 140 insertions(+), 32 deletions(-)

diff --git a/sys/dev/iommu/iommu.h b/sys/dev/iommu/iommu.h
index 65fefe3ada7b..fefd0f615be5 100644
--- a/sys/dev/iommu/iommu.h
+++ b/sys/dev/iommu/iommu.h
@@ -56,7 +56,10 @@ struct iommu_map_entry {
 	iommu_gaddr_t free_down;	/* Max free space below the
 					   current R/B tree node */
 	u_int flags;
-	TAILQ_ENTRY(iommu_map_entry) dmamap_link; /* Link for dmamap entries */
+	union {
+		TAILQ_ENTRY(iommu_map_entry) dmamap_link; /* DMA map entries */
+		struct iommu_map_entry *tlb_flush_next;
+	};
 	RB_ENTRY(iommu_map_entry) rb_entry;	 /* Links for domain entries */
 	struct iommu_domain *domain;
 	struct iommu_qi_genseq gseq;
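
The anonymous union above is safe because a map entry is only ever linked into
one kind of list at a time; the new comments in intel_ctx.c and intel_qi.c below
spell out that callers of the flush path must not be using dmamap_link.  A
trimmed-down illustration of the overlay (hypothetical struct, not the kernel's):

#include <sys/queue.h>

struct entry {
	union {
		TAILQ_ENTRY(entry) dmamap_link;	/* while owned by a dmamap */
		struct entry *tlb_flush_next;	/* while awaiting a TLB flush */
	};				/* same storage, so no size growth */
};
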
diff --git a/sys/dev/iommu/iommu_gas.c b/sys/dev/iommu/iommu_gas.c
index ec456e2ec48b..bac15edcf849 100644
--- a/sys/dev/iommu/iommu_gas.c
+++ b/sys/dev/iommu/iommu_gas.c
@@ -99,7 +99,7 @@ iommu_gas_alloc_entry(struct iommu_domain *domain, u_int flags)
 
 	res = uma_zalloc(iommu_map_entry_zone, ((flags & IOMMU_PGF_WAITOK) !=
 	    0 ? M_WAITOK : M_NOWAIT) | M_ZERO);
-	if (res != NULL) {
+	if (res != NULL && domain != NULL) {
 		res->domain = domain;
 		atomic_add_int(&domain->entries_cnt, 1);
 	}
@@ -113,7 +113,8 @@ iommu_gas_free_entry(struct iommu_domain *domain, struct iommu_map_entry *entry)
 	KASSERT(domain == entry->domain,
 	    ("mismatched free domain %p entry %p entry->domain %p", domain,
 	    entry, entry->domain));
-	atomic_subtract_int(&domain->entries_cnt, 1);
+	if (domain != NULL)
+		atomic_subtract_int(&domain->entries_cnt, 1);
 	uma_zfree(iommu_map_entry_zone, entry);
 }
 
diff --git a/sys/x86/iommu/intel_ctx.c b/sys/x86/iommu/intel_ctx.c
index 936cf8bb7632..3bd425aeecbd 100644
--- a/sys/x86/iommu/intel_ctx.c
+++ b/sys/x86/iommu/intel_ctx.c
@@ -867,6 +867,10 @@ dmar_domain_free_entry(struct iommu_map_entry *entry, bool free)
 		entry->flags = 0;
 }
 
+/*
+ * If the given value for "free" is true, then the caller must not be using
+ * the entry's dmamap_link field.
+ */
 void
 iommu_domain_unload_entry(struct iommu_map_entry *entry, bool free,
     bool cansleep)
@@ -885,10 +889,7 @@ iommu_domain_unload_entry(struct iommu_map_entry *entry, bool free,
 	if (unit->qi_enabled) {
 		if (free) {
 			DMAR_LOCK(unit);
-			dmar_qi_invalidate_locked(domain, entry->start,
-			    entry->end - entry->start, &entry->gseq, true);
-			TAILQ_INSERT_TAIL(&unit->tlb_flush_entries, entry,
-			    dmamap_link);
+			dmar_qi_invalidate_locked(domain, entry, true);
 			DMAR_UNLOCK(unit);
 		} else {
 			dmar_qi_invalidate_sync(domain, entry->start,
@@ -942,12 +943,11 @@ iommu_domain_unload(struct iommu_domain *iodom,
 
 	KASSERT(unit->qi_enabled, ("loaded entry left"));
 	DMAR_LOCK(unit);
-	TAILQ_FOREACH(entry, entries, dmamap_link) {
-		dmar_qi_invalidate_locked(domain, entry->start, entry->end -
-		    entry->start, &entry->gseq,
+	while ((entry = TAILQ_FIRST(entries)) != NULL) {
+		TAILQ_REMOVE(entries, entry, dmamap_link);
+		dmar_qi_invalidate_locked(domain, entry,
 		    dmar_domain_unload_emit_wait(domain, entry));
 	}
-	TAILQ_CONCAT(&unit->tlb_flush_entries, entries, dmamap_link);
 	DMAR_UNLOCK(unit);
 }
 
diff --git a/sys/x86/iommu/intel_dmar.h b/sys/x86/iommu/intel_dmar.h
index 06cecdf704ff..1234ee058ffd 100644
--- a/sys/x86/iommu/intel_dmar.h
+++ b/sys/x86/iommu/intel_dmar.h
@@ -177,8 +177,33 @@ struct dmar_unit {
 	u_int irte_cnt;
 	vmem_t *irtids;
 
-	/* Delayed freeing of map entries queue processing */
-	struct iommu_map_entries_tailq tlb_flush_entries;
+	/*
+	 * Delayed freeing of map entries queue processing:
+	 *
+	 * tlb_flush_head and tlb_flush_tail are used to implement a FIFO
+	 * queue that supports concurrent dequeues and enqueues.  However,
+	 * there can only be a single dequeuer (accessing tlb_flush_head) and
+	 * a single enqueuer (accessing tlb_flush_tail) at a time.  Since the
+	 * unit's qi_task is the only dequeuer, it can access tlb_flush_head
+	 * without any locking.  In contrast, there may be multiple enqueuers,
+	 * so the enqueuers acquire the iommu unit lock to serialize their
+	 * accesses to tlb_flush_tail.
+	 *
+	 * In this FIFO queue implementation, the key to enabling concurrent
+	 * dequeues and enqueues is that the dequeuer never needs to access
+	 * tlb_flush_tail and the enqueuer never needs to access
+	 * tlb_flush_head.  In particular, tlb_flush_head and tlb_flush_tail
+	 * are never NULL, so neither a dequeuer nor an enqueuer ever needs to
+	 * update both.  Instead, tlb_flush_head always points to a "zombie"
+	 * struct, which previously held the last dequeued item.  Thus, the
+	 * zombie's next field actually points to the struct holding the first
+	 * item in the queue.  When an item is dequeued, the current zombie is
+	 * finally freed, and the struct that held the just dequeued item
+	 * becomes the new zombie.  When the queue is empty, tlb_flush_tail
+	 * also points to the zombie.
+	 */
+	struct iommu_map_entry *tlb_flush_head;
+	struct iommu_map_entry *tlb_flush_tail;
 	struct task qi_task;
 	struct taskqueue *qi_taskqueue;
 };
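
For reference, the user-space sketch near the top of this message encodes the
same invariants: head and tail start out pointing at a single allocated zombie
node, so neither is ever NULL; the dequeuer decides the queue is empty by
finding a NULL next pointer in the zombie; and each dequeue frees the old zombie
while the just-dequeued entry lives on as the new one.  Only the enqueue side
needs the external lock, which is what lets dmar_qi_task() drain the list
without holding the DMAR lock.
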
@@ -249,8 +274,8 @@ void dmar_enable_qi_intr(struct dmar_unit *unit);
 void dmar_disable_qi_intr(struct dmar_unit *unit);
 int dmar_init_qi(struct dmar_unit *unit);
 void dmar_fini_qi(struct dmar_unit *unit);
-void dmar_qi_invalidate_locked(struct dmar_domain *domain, iommu_gaddr_t start,
-    iommu_gaddr_t size, struct iommu_qi_genseq *psec, bool emit_wait);
+void dmar_qi_invalidate_locked(struct dmar_domain *domain,
+    struct iommu_map_entry *entry, bool emit_wait);
 void dmar_qi_invalidate_sync(struct dmar_domain *domain, iommu_gaddr_t start,
     iommu_gaddr_t size, bool cansleep);
 void dmar_qi_invalidate_ctx_glob_locked(struct dmar_unit *unit);
diff --git a/sys/x86/iommu/intel_qi.c b/sys/x86/iommu/intel_qi.c
index 32f01a2787b0..3a0012763000 100644
--- a/sys/x86/iommu/intel_qi.c
+++ b/sys/x86/iommu/intel_qi.c
@@ -64,10 +64,11 @@ static bool
 dmar_qi_seq_processed(const struct dmar_unit *unit,
     const struct iommu_qi_genseq *pseq)
 {
+	u_int gen;
 
-	return (pseq->gen < unit->inv_waitd_gen ||
-	    (pseq->gen == unit->inv_waitd_gen &&
-	     pseq->seq <= unit->inv_waitd_seq_hw));
+	gen = unit->inv_waitd_gen;
+	return (pseq->gen < gen ||
+	    (pseq->gen == gen && pseq->seq <= unit->inv_waitd_seq_hw));
 }
 
 static int
@@ -131,6 +132,9 @@ dmar_qi_ensure(struct dmar_unit *unit, int descr_count)
 		 * inform the descriptor streamer about entries we
 		 * might have already filled, otherwise they could
 		 * clog the whole queue.
+		 *
+		 * See dmar_qi_invalidate_locked() for a discussion
+		 * about data race prevention.
 		 */
 		dmar_qi_advance_tail(unit);
 		unit->inv_queue_full++;
@@ -201,13 +205,17 @@ dmar_qi_emit_wait_seq(struct dmar_unit *unit, struct iommu_qi_genseq *pseq,
 	}
 }
 
+/*
+ * To avoid missed wakeups, callers must increment the unit's waiters count
+ * before advancing the tail past the wait descriptor.
+ */
 static void
 dmar_qi_wait_for_seq(struct dmar_unit *unit, const struct iommu_qi_genseq *gseq,
     bool nowait)
 {
 
 	DMAR_ASSERT_LOCKED(unit);
-	unit->inv_seq_waiters++;
+	KASSERT(unit->inv_seq_waiters > 0, ("%s: no waiters", __func__));
 	while (!dmar_qi_seq_processed(unit, gseq)) {
 		if (cold || nowait) {
 			cpu_spinwait();
@@ -219,8 +227,8 @@ dmar_qi_wait_for_seq(struct dmar_unit *unit, const struct iommu_qi_genseq *gseq,
 	unit->inv_seq_waiters--;
 }
 
-void
-dmar_qi_invalidate_locked(struct dmar_domain *domain, iommu_gaddr_t base,
+static void
+dmar_qi_invalidate_emit(struct dmar_domain *domain, iommu_gaddr_t base,
     iommu_gaddr_t size, struct iommu_qi_genseq *pseq, bool emit_wait)
 {
 	struct dmar_unit *unit;
@@ -239,6 +247,40 @@ dmar_qi_invalidate_locked(struct dmar_domain *domain, iommu_gaddr_t base,
 		    base | am);
 	}
 	dmar_qi_emit_wait_seq(unit, pseq, emit_wait);
+}
+
+/*
+ * The caller must not be using the entry's dmamap_link field.
+ */
+void
+dmar_qi_invalidate_locked(struct dmar_domain *domain,
+    struct iommu_map_entry *entry, bool emit_wait)
+{
+	struct dmar_unit *unit;
+
+	unit = domain->dmar;
+	DMAR_ASSERT_LOCKED(unit);
+	dmar_qi_invalidate_emit(domain, entry->start, entry->end -
+	    entry->start, &entry->gseq, emit_wait);
+
+	/*
+	 * To avoid a data race in dmar_qi_task(), the entry's gseq must be
+	 * initialized before the entry is added to the TLB flush list, and the
+	 * entry must be added to that list before the tail is advanced.  More
+	 * precisely, the tail must not be advanced past the wait descriptor
+	 * that will generate the interrupt that schedules dmar_qi_task() for
+	 * execution before the entry is added to the list.  While an earlier
+	 * call to dmar_qi_ensure() might have advanced the tail, it will not
+	 * advance it past the wait descriptor.
+	 *
+	 * See the definition of struct dmar_unit for more information on
+	 * synchronization.
+	 */
+	entry->tlb_flush_next = NULL;
+	atomic_store_rel_ptr((uintptr_t *)&unit->tlb_flush_tail->tlb_flush_next,
+	    (uintptr_t)entry);
+	unit->tlb_flush_tail = entry;
+
 	dmar_qi_advance_tail(unit);
 }
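
Read against the user-space sketch near the top of this message, the ordering
described in the comment above looks like the fragment below, which is meant to
be appended to that sketch; ring_doorbell() is a hypothetical stand-in for
dmar_qi_advance_tail(), and the payload plays the role of the entry's gseq.

static void
ring_doorbell(void)
{
	/*
	 * Stub: in the driver this is the tail-register write after which
	 * the hardware, and eventually dmar_qi_task(), can see the request.
	 */
}

/* Producer side, still serialized by the enqueuers' lock. */
static void
spsc_submit(struct spsc_queue *q, struct node *n, int tag)
{
	n->payload = tag;	/* 1. fill in the node (the gseq analogue)   */
	spsc_enqueue(q, n);	/* 2. release-store links it into the list   */
	ring_doorbell();	/* 3. only now may the consumer be scheduled */
}
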
 
@@ -251,7 +293,15 @@ dmar_qi_invalidate_sync(struct dmar_domain *domain, iommu_gaddr_t base,
 
 	unit = domain->dmar;
 	DMAR_LOCK(unit);
-	dmar_qi_invalidate_locked(domain, base, size, &gseq, true);
+	dmar_qi_invalidate_emit(domain, base, size, &gseq, true);
+
+	/*
+	 * To avoid a missed wakeup in dmar_qi_task(), the unit's waiters count
+	 * must be incremented before the tail is advanced.
+	 */
+	unit->inv_seq_waiters++;
+
+	dmar_qi_advance_tail(unit);
 	dmar_qi_wait_for_seq(unit, &gseq, !cansleep);
 	DMAR_UNLOCK(unit);
 }
@@ -265,6 +315,8 @@ dmar_qi_invalidate_ctx_glob_locked(struct dmar_unit *unit)
 	dmar_qi_ensure(unit, 2);
 	dmar_qi_emit(unit, DMAR_IQ_DESCR_CTX_INV | DMAR_IQ_DESCR_CTX_GLOB, 0);
 	dmar_qi_emit_wait_seq(unit, &gseq, true);
+	/* See dmar_qi_invalidate_sync(). */
+	unit->inv_seq_waiters++;
 	dmar_qi_advance_tail(unit);
 	dmar_qi_wait_for_seq(unit, &gseq, false);
 }
@@ -279,6 +331,8 @@ dmar_qi_invalidate_iotlb_glob_locked(struct dmar_unit *unit)
 	dmar_qi_emit(unit, DMAR_IQ_DESCR_IOTLB_INV | DMAR_IQ_DESCR_IOTLB_GLOB |
 	    DMAR_IQ_DESCR_IOTLB_DW | DMAR_IQ_DESCR_IOTLB_DR, 0);
 	dmar_qi_emit_wait_seq(unit, &gseq, true);
+	/* See dmar_qi_invalidate_sync(). */
+	unit->inv_seq_waiters++;
 	dmar_qi_advance_tail(unit);
 	dmar_qi_wait_for_seq(unit, &gseq, false);
 }
@@ -292,6 +346,8 @@ dmar_qi_invalidate_iec_glob(struct dmar_unit *unit)
 	dmar_qi_ensure(unit, 2);
 	dmar_qi_emit(unit, DMAR_IQ_DESCR_IEC_INV, 0);
 	dmar_qi_emit_wait_seq(unit, &gseq, true);
+	/* See dmar_qi_invalidate_sync(). */
+	unit->inv_seq_waiters++;
 	dmar_qi_advance_tail(unit);
 	dmar_qi_wait_for_seq(unit, &gseq, false);
 }
@@ -316,6 +372,13 @@ dmar_qi_invalidate_iec(struct dmar_unit *unit, u_int start, u_int cnt)
 	}
 	dmar_qi_ensure(unit, 1);
 	dmar_qi_emit_wait_seq(unit, &gseq, true);
+
+	/*
+	 * Since dmar_qi_wait_for_seq() will not sleep, this increment's
+	 * placement relative to advancing the tail doesn't matter.
+	 */
+	unit->inv_seq_waiters++;
+
 	dmar_qi_advance_tail(unit);
 
 	/*
@@ -352,7 +415,8 @@ static void
 dmar_qi_task(void *arg, int pending __unused)
 {
 	struct dmar_unit *unit;
-	struct iommu_map_entry *entry;
+	struct iommu_domain *domain;
+	struct iommu_map_entry *entry, *head;
 	uint32_t ics;
 
 	unit = arg;
@@ -367,21 +431,33 @@ dmar_qi_task(void *arg, int pending __unused)
 		dmar_write4(unit, DMAR_ICS_REG, ics);
 	}
 
-	DMAR_LOCK(unit);
 	for (;;) {
-		entry = TAILQ_FIRST(&unit->tlb_flush_entries);
+		head = unit->tlb_flush_head;
+		entry = (struct iommu_map_entry *)
+		    atomic_load_acq_ptr((uintptr_t *)&head->tlb_flush_next);
 		if (entry == NULL)
 			break;
 		if (!dmar_qi_seq_processed(unit, &entry->gseq))
 			break;
-		TAILQ_REMOVE(&unit->tlb_flush_entries, entry, dmamap_link);
-		DMAR_UNLOCK(unit);
-		dmar_domain_free_entry(entry, true);
-		DMAR_LOCK(unit);
+		unit->tlb_flush_head = entry;
+		iommu_gas_free_entry(head->domain, head);
+		domain = entry->domain;
+		IOMMU_DOMAIN_LOCK(domain);
+		if ((entry->flags & IOMMU_MAP_ENTRY_RMRR) != 0)
+			iommu_gas_free_region(domain, entry);
+		else
+			iommu_gas_free_space(domain, entry);
+		IOMMU_DOMAIN_UNLOCK(domain);
 	}
-	if (unit->inv_seq_waiters > 0)
+	if (unit->inv_seq_waiters > 0) {
+		/*
+		 * Acquire the DMAR lock so that wakeup() is called only after
+		 * the waiter is sleeping.
+		 */
+		DMAR_LOCK(unit);
 		wakeup(&unit->inv_seq_waiters);
-	DMAR_UNLOCK(unit);
+		DMAR_UNLOCK(unit);
+	}
 }
 
 int
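
To mirror dmar_qi_task() in the user-space sketch from the top of this message,
the dequeue loop peeks at the first node, stops if its tag has not been
processed yet, and only after committing the dequeue retires the previous
zombie.  The fragment below is meant to be appended to that sketch;
tag_processed() is a hypothetical stand-in for dmar_qi_seq_processed().

static int
tag_processed(int tag)
{
	/*
	 * Stub for dmar_qi_seq_processed(): would report whether the
	 * hardware has completed the request carrying this tag.
	 */
	return (tag >= 0);
}

/* Single consumer, no lock held; mirrors the loop in dmar_qi_task(). */
static void
spsc_drain_completed(struct spsc_queue *q)
{
	struct node *zombie, *first;

	for (;;) {
		zombie = q->head;
		first = atomic_load_explicit(&zombie->next,
		    memory_order_acquire);
		if (first == NULL)
			break;			/* nothing queued */
		if (!tag_processed(first->payload))
			break;			/* hardware not done yet */
		q->head = first;		/* commit: first is the new zombie */
		free(zombie);			/* retire the previous zombie */
		/*
		 * "first" stays allocated as the zombie; its resources would
		 * be released here (iommu_gas_free_space()/_region() in the
		 * driver).
		 */
	}
}
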
@@ -398,7 +474,8 @@ dmar_init_qi(struct dmar_unit *unit)
 	if (!unit->qi_enabled)
 		return (0);
 
-	TAILQ_INIT(&unit->tlb_flush_entries);
+	unit->tlb_flush_head = unit->tlb_flush_tail =
+	    iommu_gas_alloc_entry(NULL, 0);
 	TASK_INIT(&unit->qi_task, 0, dmar_qi_task, unit);
 	unit->qi_taskqueue = taskqueue_create_fast("dmarqf", M_WAITOK,
 	    taskqueue_thread_enqueue, &unit->qi_taskqueue);
@@ -454,6 +531,8 @@ dmar_fini_qi(struct dmar_unit *unit)
 	/* quiesce */
 	dmar_qi_ensure(unit, 1);
 	dmar_qi_emit_wait_seq(unit, &gseq, true);
+	/* See dmar_qi_invalidate_sync(). */
+	unit->inv_seq_waiters++;
 	dmar_qi_advance_tail(unit);
 	dmar_qi_wait_for_seq(unit, &gseq, false);
 	/* only after the quiesce, disable queue */