git: c3d8a93126b9 - main - Re-implement rangelocks part 1

From: Konstantin Belousov <kib@FreeBSD.org>
Date: Tue, 06 Aug 2024 04:06:34 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=c3d8a93126b9dd05fcfed3685bc3817f3c1eccc9

commit c3d8a93126b9dd05fcfed3685bc3817f3c1eccc9
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2023-08-07 04:45:01 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2024-08-06 04:05:58 +0000

    Re-implement rangelocks part 1
    
    Using the algorithms from https://doi.org/10.1145/3342195.3387533.
    
    For the first part, consider all range lock requests as exclusive.
    
    Reviewed by:    markj, Olivier Certner <olce.freebsd@certner.fr>
    Tested by:      pho
    Sponsored by:   The FreeBSD Foundation
    Differential Revision:  https://reviews.freebsd.org/D41787
---
 sys/kern/kern_rangelock.c | 455 +++++++++++++++++++++++++---------------------
 sys/kern/kern_thread.c    |   1 -
 sys/kern/uipc_shm.c       |   8 +-
 sys/kern/vfs_subr.c       |   2 -
 sys/sys/rangelock.h       |  45 ++---
 sys/sys/vnode.h           |  13 +-
 6 files changed, 267 insertions(+), 257 deletions(-)
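The key simplification in this first part is that every request is treated as exclusive, so two requests conflict exactly when their byte ranges overlap, and non-overlapping ranges can be ordered by their start offsets. A minimal userland sketch of that rule, mirroring the shape of rl_e_compare() in the new kern_rangelock.c below (hypothetical code, not part of the commit):

#include <stdio.h>
#include <sys/types.h>		/* off_t */

/*
 * Part-1 semantics, modelled in userland: ranges are half-open
 * [start, end), every request is exclusive, and two requests conflict
 * exactly when their ranges overlap.
 */
static int
range_compare(off_t s1, off_t e1, off_t s2, off_t e2)
{
	if (s1 >= e2)
		return (1);	/* first range entirely after the second */
	if (s2 >= e1)
		return (-1);	/* first range entirely before the second */
	return (0);		/* overlap: the requests conflict */
}

int
main(void)
{
	/* [0, 4096) vs [4096, 8192): adjacent, no conflict. */
	printf("%d\n", range_compare(0, 4096, 4096, 8192));	/* -1 */
	/* [0, 4096) vs [1024, 2048): overlap, conflict even for two reads. */
	printf("%d\n", range_compare(0, 4096, 1024, 2048));	/* 0 */
	return (0);
}

The same three-way result (-1/0/1) is what rl_insert() below uses to keep the granted-range list sorted by start offset and to decide when a new request must sleep.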

diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c
index a9d8c342c267..186a42caebf0 100644
--- a/sys/kern/kern_rangelock.c
+++ b/sys/kern/kern_rangelock.c
@@ -27,304 +27,335 @@
  */
 
 #include <sys/param.h>
+#include <sys/kassert.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rangelock.h>
-#include <sys/systm.h>
+#include <sys/sleepqueue.h>
+#include <sys/smr.h>
 
 #include <vm/uma.h>
 
+/*
+ * Implementation of range locks based on the paper
+ * https://doi.org/10.1145/3342195.3387533
+ * arXiv:2006.12144v1 [cs.OS] 22 Jun 2020
+ * Scalable Range Locks for Scalable Address Spaces and Beyond
+ * by Alex Kogan, Dave Dice, and Shady Issa
+ */
+
+static struct rl_q_entry *rl_e_unmark(const struct rl_q_entry *e);
+
+/*
+ * rl_q_next links all granted ranges in the lock.  We cannot free an
+ * rl_q_entry while in the smr section, and cannot reuse rl_q_next
+ * linkage since other threads might follow it even after CAS removed
+ * the range.  Use rl_q_free for local list of ranges to remove after
+ * the smr section is dropped.
+ */
 struct rl_q_entry {
-	TAILQ_ENTRY(rl_q_entry) rl_q_link;
+	struct rl_q_entry *rl_q_next;
+	struct rl_q_entry *rl_q_free;
 	off_t		rl_q_start, rl_q_end;
 	int		rl_q_flags;
+#ifdef INVARIANTS
+	struct thread	*rl_q_owner;
+#endif
 };
 
 static uma_zone_t rl_entry_zone;
+static smr_t rl_smr;
 
 static void
 rangelock_sys_init(void)
 {
-
 	rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct rl_q_entry),
+	    UMA_ZONE_SMR);
+	rl_smr = uma_zone_get_smr(rl_entry_zone);
 }
-SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);
+SYSINIT(rl, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);
 
 static struct rl_q_entry *
-rlqentry_alloc(void)
+rlqentry_alloc(vm_ooffset_t start, vm_ooffset_t end, int flags)
 {
+	struct rl_q_entry *e;
+
+	e = uma_zalloc_smr(rl_entry_zone, M_WAITOK);
+	e->rl_q_next = NULL;
+	e->rl_q_free = NULL;
+	e->rl_q_start = start;
+	e->rl_q_end = end;
+	e->rl_q_flags = flags;
+#ifdef INVARIANTS
+	e->rl_q_owner = curthread;
+#endif
+	return (e);
+}
 
-	return (uma_zalloc(rl_entry_zone, M_WAITOK));
+void
+rangelock_init(struct rangelock *lock)
+{
+	lock->sleepers = false;
+	atomic_store_ptr(&lock->head, NULL);
 }
 
 void
-rlqentry_free(struct rl_q_entry *rleq)
+rangelock_destroy(struct rangelock *lock)
 {
+	struct rl_q_entry *e, *ep;
 
-	uma_zfree(rl_entry_zone, rleq);
+	MPASS(!lock->sleepers);
+	for (e = (struct rl_q_entry *)atomic_load_ptr(&lock->head);
+	    e != NULL; e = rl_e_unmark(ep)) {
+		ep = atomic_load_ptr(&e->rl_q_next);
+		uma_zfree_smr(rl_entry_zone, e);
+	}
 }
 
-void
-rangelock_init(struct rangelock *lock)
+static bool
+rl_e_is_marked(const struct rl_q_entry *e)
 {
+	return (((uintptr_t)e & 1) != 0);
+}
 
-	TAILQ_INIT(&lock->rl_waiters);
-	lock->rl_currdep = NULL;
+static struct rl_q_entry *
+rl_e_unmark(const struct rl_q_entry *e)
+{
+	MPASS(rl_e_is_marked(e));
+	return ((struct rl_q_entry *)((uintptr_t)e & ~1));
 }
 
-void
-rangelock_destroy(struct rangelock *lock)
+static struct rl_q_entry *
+rl_q_load(struct rl_q_entry **p)
 {
+	return ((struct rl_q_entry *)atomic_load_acq_ptr((uintptr_t *)p));
+}
 
-	KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters"));
+void
+rangelock_unlock(struct rangelock *lock, void *cookie)
+{
+	struct rl_q_entry *e;
+
+	e = cookie;
+	MPASS(lock != NULL && e != NULL);
+	MPASS(!rl_e_is_marked(rl_q_load(&e->rl_q_next)));
+	MPASS(e->rl_q_owner == curthread);
+
+	sleepq_lock(&lock->sleepers);
+#ifdef INVARIANTS
+	int r = atomic_testandset_long((uintptr_t *)&e->rl_q_next, 0);
+	MPASS(r == 0);
+#else
+	atomic_set_ptr((uintptr_t *)&e->rl_q_next, 1);
+#endif
+	lock->sleepers = false;
+	sleepq_broadcast(&lock->sleepers, SLEEPQ_SLEEP, 0, 0);
+	sleepq_release(&lock->sleepers);
 }
 
 /*
- * Two entries are compatible if their ranges do not overlap, or both
- * entries are for read.
+ * result: -1 if e1 before e2
+ *          1 if e1 after e2
+ *          0 if e1 and e2 overlap
  */
 static int
-ranges_overlap(const struct rl_q_entry *e1,
-    const struct rl_q_entry *e2)
+rl_e_compare(const struct rl_q_entry *e1, const struct rl_q_entry *e2)
 {
-
-	if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start)
+	if (e1 == NULL)
 		return (1);
+	if (e1->rl_q_start >= e2->rl_q_end)
+		return (1);
+	if (e2->rl_q_start >= e1->rl_q_end)
+		return (-1);
 	return (0);
 }
 
-/*
- * Recalculate the lock->rl_currdep after an unlock.
- */
 static void
-rangelock_calc_block(struct rangelock *lock)
+rl_insert_sleep(struct rangelock *lock)
 {
-	struct rl_q_entry *entry, *nextentry, *entry1;
-
-	for (entry = lock->rl_currdep; entry != NULL; entry = nextentry) {
-		nextentry = TAILQ_NEXT(entry, rl_q_link);
-		if (entry->rl_q_flags & RL_LOCK_READ) {
-			/* Reads must not overlap with granted writes. */
-			for (entry1 = TAILQ_FIRST(&lock->rl_waiters);
-			    !(entry1->rl_q_flags & RL_LOCK_READ);
-			    entry1 = TAILQ_NEXT(entry1, rl_q_link)) {
-				if (ranges_overlap(entry, entry1))
-					goto out;
-			}
-		} else {
-			/* Write must not overlap with any granted locks. */
-			for (entry1 = TAILQ_FIRST(&lock->rl_waiters);
-			    entry1 != entry;
-			    entry1 = TAILQ_NEXT(entry1, rl_q_link)) {
-				if (ranges_overlap(entry, entry1))
-					goto out;
-			}
-
-			/* Move grantable write locks to the front. */
-			TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
-			TAILQ_INSERT_HEAD(&lock->rl_waiters, entry, rl_q_link);
-		}
-
-		/* Grant this lock. */
-		entry->rl_q_flags |= RL_LOCK_GRANTED;
-		wakeup(entry);
-	}
-out:
-	lock->rl_currdep = entry;
+	smr_exit(rl_smr);
+	DROP_GIANT();
+	lock->sleepers = true;
+	sleepq_add(&lock->sleepers, NULL, "rangelk", 0, 0);
+	sleepq_wait(&lock->sleepers, PRI_USER);
+	PICKUP_GIANT();
+	smr_enter(rl_smr);
 }
 
-static void
-rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry,
-    struct mtx *ilk, bool do_calc_block)
+static bool
+rl_q_cas(struct rl_q_entry **prev, struct rl_q_entry *old,
+    struct rl_q_entry *new)
 {
-
-	MPASS(lock != NULL && entry != NULL && ilk != NULL);
-	mtx_assert(ilk, MA_OWNED);
-
-	if (!do_calc_block) {
-		/*
-		 * This is the case where rangelock_enqueue() has been called
-		 * with trylock == true and just inserted this entry in the
-		 * queue.
-		 * If rl_currdep is this entry, rl_currdep needs to
-		 * be set to the next entry in the rl_waiters list.
-		 * However, since this entry is the last entry in the
-		 * list, the next entry is NULL.
-		 */
-		if (lock->rl_currdep == entry) {
-			KASSERT(TAILQ_NEXT(lock->rl_currdep, rl_q_link) == NULL,
-			    ("rangelock_enqueue: next entry not NULL"));
-			lock->rl_currdep = NULL;
-		}
-	} else
-		KASSERT(entry != lock->rl_currdep, ("stuck currdep"));
-
-	TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
-	if (do_calc_block)
-		rangelock_calc_block(lock);
-	mtx_unlock(ilk);
-	if (curthread->td_rlqe == NULL)
-		curthread->td_rlqe = entry;
-	else
-		rlqentry_free(entry);
+	return (atomic_cmpset_rel_ptr((uintptr_t *)prev, (uintptr_t)old,
+	    (uintptr_t)new) != 0);
 }
 
-void
-rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk)
+static bool
+rl_insert(struct rangelock *lock, struct rl_q_entry *e, bool trylock,
+    struct rl_q_entry **free)
 {
+	struct rl_q_entry *cur, *next, **prev;
+	int r;
+
+again:
+	prev = &lock->head;
+	if (rl_q_load(prev) == NULL && rl_q_cas(prev, NULL, e))
+		return (true);
+
+	for (cur = rl_q_load(prev);;) {
+		if (rl_e_is_marked(cur))
+			goto again;
+
+		if (cur != NULL) {
+			next = rl_q_load(&cur->rl_q_next);
+			if (rl_e_is_marked(next)) {
+				next = rl_e_unmark(next);
+				if (rl_q_cas(prev, cur, next)) {
+#ifdef INVARIANTS
+					cur->rl_q_owner = NULL;
+#endif
+					cur->rl_q_free = *free;
+					*free = cur;
+				}
+				cur = next;
+				continue;
+			}
+		}
 
-	MPASS(lock != NULL && cookie != NULL && ilk != NULL);
-
-	mtx_lock(ilk);
-	rangelock_unlock_locked(lock, cookie, ilk, true);
-}
-
-/*
- * Unlock the sub-range of granted lock.
- */
-void *
-rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start,
-    off_t end, struct mtx *ilk)
-{
-	struct rl_q_entry *entry;
-
-	MPASS(lock != NULL && cookie != NULL && ilk != NULL);
-	entry = cookie;
-	KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED,
-	    ("Unlocking non-granted lock"));
-	KASSERT(entry->rl_q_start == start, ("wrong start"));
-	KASSERT(entry->rl_q_end >= end, ("wrong end"));
-
-	mtx_lock(ilk);
-	if (entry->rl_q_end == end) {
-		rangelock_unlock_locked(lock, cookie, ilk, true);
-		return (NULL);
+		r = rl_e_compare(cur, e);
+		if (r == -1) {
+			prev = &cur->rl_q_next;
+			cur = rl_q_load(prev);
+		} else if (r == 0) {
+			sleepq_lock(&lock->sleepers);
+			if (__predict_false(rl_e_is_marked(rl_q_load(
+			    &cur->rl_q_next)))) {
+				sleepq_release(&lock->sleepers);
+				continue;
+			}
+			if (trylock) {
+				sleepq_release(&lock->sleepers);
+				return (false);
+			}
+			rl_insert_sleep(lock);
+			/* e is still valid */
+			goto again;
+		} else /* r == 1 */ {
+			e->rl_q_next = cur;
+			if (rl_q_cas(prev, cur, e)) {
+				atomic_thread_fence_acq();
+				return (true);
+			}
+			/* Reset rl_q_next in case we hit fast path. */
+			e->rl_q_next = NULL;
+			cur = rl_q_load(prev);
+		}
 	}
-	entry->rl_q_end = end;
-	rangelock_calc_block(lock);
-	mtx_unlock(ilk);
-	return (cookie);
 }
 
-/*
- * Add the lock request to the queue of the pending requests for
- * rangelock.  Sleep until the request can be granted unless trylock == true.
- */
-static void *
-rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode,
-    struct mtx *ilk, bool trylock)
+static struct rl_q_entry *
+rangelock_lock_int(struct rangelock *lock, struct rl_q_entry *e,
+    bool trylock)
 {
-	struct rl_q_entry *entry;
-	struct thread *td;
-
-	MPASS(lock != NULL && ilk != NULL);
-
-	td = curthread;
-	if (td->td_rlqe != NULL) {
-		entry = td->td_rlqe;
-		td->td_rlqe = NULL;
-	} else
-		entry = rlqentry_alloc();
-	MPASS(entry != NULL);
-	entry->rl_q_flags = mode;
-	entry->rl_q_start = start;
-	entry->rl_q_end = end;
-
-	mtx_lock(ilk);
-	/*
-	 * XXXKIB TODO. Check that a thread does not try to enqueue a
-	 * lock that is incompatible with another request from the same
-	 * thread.
-	 */
-
-	TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link);
-	/*
-	 * If rl_currdep == NULL, there is no entry waiting for a conflicting
-	 * range to be resolved, so set rl_currdep to this entry.  If there is
-	 * no conflicting entry for this entry, rl_currdep will be set back to
-	 * NULL by rangelock_calc_block().
-	 */
-	if (lock->rl_currdep == NULL)
-		lock->rl_currdep = entry;
-	rangelock_calc_block(lock);
-	while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) {
-		if (trylock) {
-			/*
-			 * For this case, the range is not actually locked
-			 * yet, but removal from the list requires the same
-			 * steps, except for not doing a rangelock_calc_block()
-			 * call, since rangelock_calc_block() was called above.
-			 */
-			rangelock_unlock_locked(lock, entry, ilk, false);
-			return (NULL);
-		}
-		msleep(entry, ilk, 0, "range", 0);
+	struct rl_q_entry *free, *x, *xp;
+	bool res;
+
+	free = NULL;
+	smr_enter(rl_smr);
+	res = rl_insert(lock, e, trylock, &free);
+	smr_exit(rl_smr);
+	MPASS(trylock || res);
+	if (!res) {
+		e->rl_q_free = free;
+		free = e;
+		e = NULL;
+	}
+	for (x = free; x != NULL; x = xp) {
+		MPASS(!rl_e_is_marked(x));
+		xp = x->rl_q_free;
+		MPASS(!rl_e_is_marked(xp));
+		uma_zfree_smr(rl_entry_zone, x);
 	}
-	mtx_unlock(ilk);
-	return (entry);
+	return (e);
 }
 
 void *
-rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+rangelock_rlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
 {
+	struct rl_q_entry *e;
 
-	return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk, false));
+	e = rlqentry_alloc(start, end, RL_LOCK_READ);
+	return (rangelock_lock_int(lock, e, false));
 }
 
 void *
-rangelock_tryrlock(struct rangelock *lock, off_t start, off_t end,
-    struct mtx *ilk)
+rangelock_tryrlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
 {
+	struct rl_q_entry *e;
 
-	return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk, true));
+	e = rlqentry_alloc(start, end, RL_LOCK_READ);
+	return (rangelock_lock_int(lock, e, true));
 }
 
 void *
-rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+rangelock_wlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
 {
+	struct rl_q_entry *e;
 
-	return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk, false));
+	e = rlqentry_alloc(start, end, RL_LOCK_WRITE);
+	return (rangelock_lock_int(lock, e, true));
 }
 
 void *
-rangelock_trywlock(struct rangelock *lock, off_t start, off_t end,
-    struct mtx *ilk)
+rangelock_trywlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
 {
+	struct rl_q_entry *e;
 
-	return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk, true));
+	e = rlqentry_alloc(start, end, RL_LOCK_WRITE);
+	return (rangelock_lock_int(lock, e, true));
 }
 
 #ifdef INVARIANT_SUPPORT
 void
 _rangelock_cookie_assert(void *cookie, int what, const char *file, int line)
 {
-	struct rl_q_entry *entry;
-	int flags;
-
-	MPASS(cookie != NULL);
-	entry = cookie;
-	flags = entry->rl_q_flags;
-	switch (what) {
-	case RCA_LOCKED:
-		if ((flags & RL_LOCK_GRANTED) == 0)
-			panic("rangelock not held @ %s:%d\n", file, line);
-		break;
-	case RCA_RLOCKED:
-		if ((flags & (RL_LOCK_GRANTED | RL_LOCK_READ)) !=
-		    (RL_LOCK_GRANTED | RL_LOCK_READ))
-			panic("rangelock not rlocked @ %s:%d\n", file, line);
-		break;
-	case RCA_WLOCKED:
-		if ((flags & (RL_LOCK_GRANTED | RL_LOCK_WRITE)) !=
-		    (RL_LOCK_GRANTED | RL_LOCK_WRITE))
-			panic("rangelock not wlocked @ %s:%d\n", file, line);
-		break;
-	default:
-		panic("Unknown rangelock assertion: %d @ %s:%d", what, file,
-		    line);
-	}
 }
 #endif	/* INVARIANT_SUPPORT */
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(rangelock, db_show_rangelock)
+{
+	struct rangelock *lock;
+	struct rl_q_entry *e, *x;
+
+	if (!have_addr) {
+		db_printf("show rangelock addr\n");
+		return;
+	}
+
+	lock = (struct rangelock *)addr;
+	db_printf("rangelock %p sleepers %d\n", lock, lock->sleepers);
+	for (e = lock->head;;) {
+		x = rl_e_is_marked(e) ? rl_e_unmark(e) : e;
+		if (x == NULL)
+			break;
+		db_printf("  entry %p marked %d %d start %#jx end %#jx "
+		    "flags %x next %p",
+		    e, rl_e_is_marked(e), rl_e_is_marked(x->rl_q_next),
+		    x->rl_q_start, x->rl_q_end, x->rl_q_flags, x->rl_q_next);
+#ifdef INVARIANTS
+		db_printf(" owner %p (%d)", x->rl_q_owner,
+		    x->rl_q_owner != NULL ? x->rl_q_owner->td_tid : -1);
+#endif
+		db_printf("\n");
+		e = x->rl_q_next;
+	}
+}
+
+#endif	/* DDB */
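The new kern_rangelock.c above keeps granted ranges on a lock-free singly linked list: rangelock_unlock() only sets bit 0 of the entry's rl_q_next pointer (logical deletion), and a later traversal in rl_insert() physically unlinks marked entries with a compare-and-swap, deferring the actual free until the SMR section is left. A rough userland illustration of that marked-pointer technique, using C11 atomics in place of the kernel's atomic(9) and SMR primitives (all names here are hypothetical, not the kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct node {
	_Atomic(uintptr_t)	next;	/* successor pointer, bit 0 = deleted */
	int			value;
};

static bool
is_marked(uintptr_t p)
{
	return ((p & 1) != 0);
}

static struct node *
unmark(uintptr_t p)
{
	return ((struct node *)(p & ~(uintptr_t)1));
}

/* Logically delete: set the mark bit on the victim's next pointer. */
static void
logical_delete(struct node *n)
{
	atomic_fetch_or(&n->next, 1);
}

/*
 * Physically unlink a marked successor of *prevp, as a traversal does in
 * the kernel code when it finds rl_q_next with the mark bit set.
 */
static bool
physical_unlink(_Atomic(uintptr_t) *prevp, struct node *victim)
{
	uintptr_t old = (uintptr_t)victim;
	uintptr_t next = atomic_load(&victim->next);

	if (!is_marked(next))
		return (false);		/* not logically deleted */
	return (atomic_compare_exchange_strong(prevp, &old,
	    (uintptr_t)unmark(next)));
}

int
main(void)
{
	_Atomic(uintptr_t) head;
	struct node a = { .value = 1 }, b = { .value = 2 };

	atomic_store(&a.next, (uintptr_t)&b);
	atomic_store(&b.next, (uintptr_t)NULL);
	atomic_store(&head, (uintptr_t)&a);

	logical_delete(&a);			/* like rangelock_unlock() */
	if (physical_unlink(&head, &a))		/* like rl_insert()'s cleanup */
		printf("head now %p (node b)\n",
		    (void *)atomic_load(&head));
	return (0);
}

Splitting deletion into mark-then-unlink lets the unlock path avoid walking or locking the whole list; the sleepqueue lock taken in rangelock_unlock() serializes only with requests that are about to go to sleep.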
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index 00f99516773c..c951e7297c89 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -478,7 +478,6 @@ thread_fini(void *mem, int size)
 
 	td = (struct thread *)mem;
 	EVENTHANDLER_DIRECT_INVOKE(thread_fini, td);
-	rlqentry_free(td->td_rlqe);
 	turnstile_free(td->td_turnstile);
 	sleepq_free(td->td_sleepqueue);
 	umtx_thread_fini(td);
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
index 7672ded459df..49b5b56dee17 100644
--- a/sys/kern/uipc_shm.c
+++ b/sys/kern/uipc_shm.c
@@ -184,13 +184,13 @@ SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries,
     "Number of contig reclaims before giving up for default alloc policy");
 
 #define	shm_rangelock_unlock(shmfd, cookie)				\
-	rangelock_unlock(&(shmfd)->shm_rl, (cookie), &(shmfd)->shm_mtx)
+	rangelock_unlock(&(shmfd)->shm_rl, (cookie))
 #define	shm_rangelock_rlock(shmfd, start, end)				\
-	rangelock_rlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx)
+	rangelock_rlock(&(shmfd)->shm_rl, (start), (end))
 #define	shm_rangelock_tryrlock(shmfd, start, end)			\
-	rangelock_tryrlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx)
+	rangelock_tryrlock(&(shmfd)->shm_rl, (start), (end))
 #define	shm_rangelock_wlock(shmfd, start, end)				\
-	rangelock_wlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx)
+	rangelock_wlock(&(shmfd)->shm_rl, (start), (end))
 
 static int
 uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 8012fab29081..f192c6798858 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -2180,8 +2180,6 @@ freevnode(struct vnode *vp)
 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
 	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
 	    ("dirty blk trie not empty"));
-	VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
-	    ("Dangling rangelock waiters"));
 	VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp,
 	    ("Leaked inactivation"));
 	VI_UNLOCK(vp);
diff --git a/sys/sys/rangelock.h b/sys/sys/rangelock.h
index 62ccf77c03bc..310371bef879 100644
--- a/sys/sys/rangelock.h
+++ b/sys/sys/rangelock.h
@@ -29,12 +29,14 @@
 #ifndef	_SYS_RANGELOCK_H
 #define	_SYS_RANGELOCK_H
 
-#include <sys/queue.h>
+#include <sys/types.h>
+#ifndef _KERNEL
+#include <stdbool.h>
+#endif
 
 #define	RL_LOCK_READ		0x0001
 #define	RL_LOCK_WRITE		0x0002
 #define	RL_LOCK_TYPE_MASK	0x0003
-#define	RL_LOCK_GRANTED		0x0004
 
 struct rl_q_entry;
 
@@ -44,42 +46,25 @@ struct rl_q_entry;
  * all existing lock owners are compatible with the request. Two lock
  * owners are compatible if their ranges do not overlap, or both
  * owners are for read.
- *
- * Access to the structure itself is synchronized with the externally
- * supplied mutex.
- *
- * rl_waiters is the queue containing in order (a) granted write lock
- * requests, (b) granted read lock requests, and (c) in order of arrival,
- * lock requests which cannot be granted yet.
- *
- * rl_currdep is the first lock request that cannot be granted now due
- * to the preceding requests conflicting with it (i.e., it points to
- * position (c) in the list above).
  */
 struct rangelock {
-	TAILQ_HEAD(, rl_q_entry) rl_waiters;
-	struct rl_q_entry	*rl_currdep;
+	struct rl_q_entry *head;
+	bool sleepers;
 };
 
 #ifdef _KERNEL
 
-struct mtx;
-
 void	 rangelock_init(struct rangelock *lock);
 void	 rangelock_destroy(struct rangelock *lock);
-void	 rangelock_unlock(struct rangelock *lock, void *cookie,
-	    struct mtx *ilk);
-void	*rangelock_unlock_range(struct rangelock *lock, void *cookie,
-	    off_t start, off_t end, struct mtx *ilk);
-void	*rangelock_rlock(struct rangelock *lock, off_t start, off_t end,
-	    struct mtx *ilk);
-void	*rangelock_tryrlock(struct rangelock *lock, off_t start, off_t end,
-	    struct mtx *ilk);
-void	*rangelock_wlock(struct rangelock *lock, off_t start, off_t end,
-	    struct mtx *ilk);
-void	*rangelock_trywlock(struct rangelock *lock, off_t start, off_t end,
-	    struct mtx *ilk);
-void	 rlqentry_free(struct rl_q_entry *rlqe);
+void	 rangelock_unlock(struct rangelock *lock, void *cookie);
+void	*rangelock_rlock(struct rangelock *lock, vm_ooffset_t start,
+    vm_ooffset_t end);
+void	*rangelock_tryrlock(struct rangelock *lock, vm_ooffset_t start,
+    vm_ooffset_t end);
+void	*rangelock_wlock(struct rangelock *lock, vm_ooffset_t start,
+    vm_ooffset_t end);
+void	*rangelock_trywlock(struct rangelock *lock, vm_ooffset_t start,
+    vm_ooffset_t end);
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 void	_rangelock_cookie_assert(void *cookie, int what, const char *file,
     int line);
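The header comment above still states the full compatibility rule (two owners are compatible if their ranges do not overlap, or both are readers), even though part 1 of the rework treats every request as exclusive. A hypothetical userland restatement of that rule, not taken from the commit:

#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>

#define	RL_LOCK_READ	0x0001
#define	RL_LOCK_WRITE	0x0002

/* Compatible if the ranges are disjoint, or both owners are readers. */
static bool
ranges_compatible(off_t s1, off_t e1, int f1, off_t s2, off_t e2, int f2)
{
	if (e1 <= s2 || e2 <= s1)
		return (true);		/* disjoint ranges never conflict */
	return ((f1 & RL_LOCK_READ) != 0 && (f2 & RL_LOCK_READ) != 0);
}

int
main(void)
{
	printf("%d\n", ranges_compatible(0, 100, RL_LOCK_READ,
	    50, 150, RL_LOCK_READ));	/* 1: overlapping reads may share */
	printf("%d\n", ranges_compatible(0, 100, RL_LOCK_READ,
	    50, 150, RL_LOCK_WRITE));	/* 0: overlapping read/write conflict */
	return (0);
}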
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index dc926d7a9c9e..0d0f228f7051 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -833,18 +833,15 @@ void	vn_seqc_write_end(struct vnode *vp);
 #define	vn_seqc_consistent(vp, seq)	seqc_consistent(&(vp)->v_seqc, seq)
 
 #define	vn_rangelock_unlock(vp, cookie)					\
-	rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp))
-#define	vn_rangelock_unlock_range(vp, cookie, start, end)		\
-	rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), 	\
-	    VI_MTX(vp))
+	rangelock_unlock(&(vp)->v_rl, (cookie))
 #define	vn_rangelock_rlock(vp, start, end)				\
-	rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
+	rangelock_rlock(&(vp)->v_rl, (start), (end))
 #define	vn_rangelock_tryrlock(vp, start, end)				\
-	rangelock_tryrlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
+	rangelock_tryrlock(&(vp)->v_rl, (start), (end))
 #define	vn_rangelock_wlock(vp, start, end)				\
-	rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
+	rangelock_wlock(&(vp)->v_rl, (start), (end))
 #define	vn_rangelock_trywlock(vp, start, end)				\
-	rangelock_trywlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
+	rangelock_trywlock(&(vp)->v_rl, (start), (end))
 
 #define	vn_irflag_read(vp)	atomic_load_short(&(vp)->v_irflag)
 void	vn_irflag_set_locked(struct vnode *vp, short toset);