svn commit: r367595 - head/sys/vm

Mark Johnston markj at FreeBSD.org
Wed Nov 11 17:16:40 UTC 2020


Author: markj
Date: Wed Nov 11 17:16:39 2020
New Revision: 367595
URL: https://svnweb.freebsd.org/changeset/base/367595

Log:
  vm_map: Handle kernel map entry allocator recursion
  
  On platforms without a direct map[*], vm_map_insert() may in rare
  situations need to allocate a kernel map entry in order to map the
  slab that backs new kernel map entries.  This poses a problem similar
  to the one solved for vmem boundary tags by vmem_bt_alloc().  In fact,
  the kernel map case is a bit more complicated, since we must allocate
  entries with the kernel map
  locked, whereas vmem can recurse into itself because boundary tags are
  allocated up-front.
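
  To illustrate the up-front approach that lets vmem recurse safely, here
  is a userspace sketch (the names are invented; this is neither the vmem
  nor the vm_map code): an allocator whose bookkeeping records come from
  itself keeps a small stock of records refilled ahead of time, so the
  allocation path itself never has to recurse.

  #include <assert.h>
  #include <stdlib.h>

  struct record {			/* bookkeeping for one allocation */
  	size_t		 size;
  	struct record	*next;
  };

  static struct record *stock;		/* records refilled ahead of time */
  static int nstock;

  /* Refill the stock at a point where allocating is still safe. */
  static void
  restock(int want)
  {
  	struct record *r;

  	while (nstock < want) {
  		r = malloc(sizeof(*r));
  		assert(r != NULL);
  		r->next = stock;
  		stock = r;
  		nstock++;
  	}
  }

  /* Allocation path: consumes pre-stocked records, never calls malloc(). */
  static struct record *
  record_alloc(size_t size)
  {
  	struct record *r;

  	assert(nstock > 0);
  	r = stock;
  	stock = r->next;
  	nstock--;
  	r->size = size;
  	r->next = NULL;
  	return (r);
  }

  int
  main(void)
  {
  	restock(4);		/* analogous to stocking boundary tags */
  	free(record_alloc(128));
  	return (0);
  }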
  
  The solution is to add a custom slab allocator for kmapentzone which
  allocates KVA directly from kernel_map, bypassing the kmem_* layer.
  This avoids mutual recursion with the vmem btag allocator.  Then, when
  vm_map_insert() allocates a new kernel map entry, it passes M_NOVM so
  that allocation of a new slab is not triggered until after the insertion
  is complete.  Instead, vm_map_insert() allocates from the reserve and sets
  a flag in kernel_map to trigger re-population of the reserve just before
  the map is unlocked.  This places an implicit upper bound on the number
  of kernel map entries that may be allocated before the kernel map lock
  is released, but in general a bound of 1 suffices.
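
  The following userspace model condenses this flag-and-refill flow (a
  sketch only: the names below are invented for illustration; the real
  logic lives in vm_map_entry_create() and _vm_map_unlock() in the diff
  below).

  #include <assert.h>
  #include <pthread.h>
  #include <stdbool.h>
  #include <stdlib.h>

  #define	RESERVE	1			/* mirrors KMAPENT_RESERVE */

  struct entry { struct entry *next; };

  static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
  static struct entry *reserve_list;
  static int nreserve;
  static bool need_replenish;		/* mirrors MAP_REPLENISH */

  /* Top up the reserve; safe whenever allocating new memory is allowed. */
  static void
  replenish(void)
  {
  	struct entry *e;

  	while (nreserve < RESERVE) {
  		e = malloc(sizeof(*e));
  		assert(e != NULL);
  		e->next = reserve_list;
  		reserve_list = e;
  		nreserve++;
  	}
  }

  /*
   * Called with map_lock held.  The kernel first tries a normal M_NOVM
   * allocation and only then dips into the reserve; the toy always
   * consumes the reserve to keep the flow visible.
   */
  static struct entry *
  entry_create_locked(void)
  {
  	struct entry *e;

  	assert(nreserve > 0);
  	e = reserve_list;
  	reserve_list = e->next;
  	nreserve--;
  	need_replenish = true;		/* refill before the lock is dropped */
  	return (e);
  }

  static void
  map_unlock(void)
  {
  	if (need_replenish) {
  		/* In the kernel this is uma_prealloc(kmapentzone, 1). */
  		replenish();
  		need_replenish = false;
  	}
  	pthread_mutex_unlock(&map_lock);
  }

  int
  main(void)
  {
  	replenish();			/* prime the reserve up front */
  	pthread_mutex_lock(&map_lock);
  	struct entry *e = entry_create_locked();
  	map_unlock();			/* reserve is topped up here */
  	free(e);
  	return (0);
  }

  In userspace it is harmless to allocate while the lock is held; in the
  kernel, kmapent_alloc() exists precisely so that this replenishment is
  safe while kernel_map is still locked.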
  
  [*] This also comes up on amd64 with UMA_MD_SMALL_ALLOC undefined, a
  configuration required by some kernel sanitizers.
  
  Discussed with:	kib, rlibby
  Reported by:	andrew
  Tested by:	pho (i386 and amd64 with !UMA_MD_SMALL_ALLOC)
  Sponsored by:	The FreeBSD Foundation
  Differential Revision:	https://reviews.freebsd.org/D26851

Modified:
  head/sys/vm/vm_map.c
  head/sys/vm/vm_map.h

Modified: head/sys/vm/vm_map.c
==============================================================================
--- head/sys/vm/vm_map.c	Wed Nov 11 15:53:36 2020	(r367594)
+++ head/sys/vm/vm_map.c	Wed Nov 11 17:16:39 2020	(r367595)
@@ -175,29 +175,106 @@ static void vm_map_wire_entry_failure(vm_map_t map, vm
 			start = end;			\
 		}
 
+#ifndef UMA_MD_SMALL_ALLOC
+
 /*
+ * Allocate a new slab for kernel map entries.  The kernel map may be locked or
+ * unlocked, depending on whether the request is coming from the kernel map or a
+ * submap.  This function allocates a virtual address range directly from the
+ * kernel map instead of the kmem_* layer to avoid recursion on the kernel map
+ * lock and also to avoid triggering allocator recursion in the vmem boundary
+ * tag allocator.
+ */
+static void *
+kmapent_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
+    int wait)
+{
+	vm_offset_t addr;
+	int error, locked;
+
+	*pflag = UMA_SLAB_PRIV;
+
+	if (!(locked = vm_map_locked(kernel_map)))
+		vm_map_lock(kernel_map);
+	addr = vm_map_findspace(kernel_map, vm_map_min(kernel_map), bytes);
+	if (addr + bytes < addr || addr + bytes > vm_map_max(kernel_map))
+		panic("%s: kernel map is exhausted", __func__);
+	error = vm_map_insert(kernel_map, NULL, 0, addr, addr + bytes,
+	    VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
+	if (error != KERN_SUCCESS)
+		panic("%s: vm_map_insert() failed: %d", __func__, error);
+	if (!locked)
+		vm_map_unlock(kernel_map);
+	error = kmem_back_domain(domain, kernel_object, addr, bytes, M_NOWAIT |
+	    M_USE_RESERVE | (wait & M_ZERO));
+	if (error == KERN_SUCCESS) {
+		return ((void *)addr);
+	} else {
+		if (!locked)
+			vm_map_lock(kernel_map);
+		vm_map_delete(kernel_map, addr, bytes);
+		if (!locked)
+			vm_map_unlock(kernel_map);
+		return (NULL);
+	}
+}
+
+static void
+kmapent_free(void *item, vm_size_t size, uint8_t pflag)
+{
+	vm_offset_t addr;
+	int error;
+
+	if ((pflag & UMA_SLAB_PRIV) == 0)
+		/* XXX leaked */
+		return;
+
+	addr = (vm_offset_t)item;
+	kmem_unback(kernel_object, addr, size);
+	error = vm_map_remove(kernel_map, addr, addr + size);
+	KASSERT(error == KERN_SUCCESS,
+	    ("%s: vm_map_remove failed: %d", __func__, error));
+}
+
+/*
+ * The worst-case upper bound on the number of kernel map entries that may be
+ * created before the zone must be replenished in _vm_map_unlock().
+ */
+#define	KMAPENT_RESERVE		1
+
+#endif /* !UMA_MD_SMALL_ALLOC */
+
+/*
  *	vm_map_startup:
  *
- *	Initialize the vm_map module.  Must be called before
- *	any other vm_map routines.
+ *	Initialize the vm_map module.  Must be called before any other vm_map
+ *	routines.
  *
- *	Map and entry structures are allocated from the general
- *	purpose memory pool with some exceptions:
- *
- *	- The kernel map and kmem submap are allocated statically.
- *	- Kernel map entries are allocated out of a static pool.
- *
- *	These restrictions are necessary since malloc() uses the
- *	maps and requires map entries.
+ *	User map and entry structures are allocated from the general purpose
+ *	memory pool.  Kernel maps are statically defined.  Kernel map entries
+ *	require special handling to avoid recursion; see the comments above
+ *	kmapent_alloc() and in vm_map_entry_create().
  */
-
 void
 vm_map_startup(void)
 {
 	mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
+
+	/*
+	 * Disable the use of per-CPU buckets: map entry allocation is
+	 * serialized by the kernel map lock.
+	 */
 	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
-	    UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
+	    UMA_ZONE_VM | UMA_ZONE_NOBUCKET);
+#ifndef UMA_MD_SMALL_ALLOC
+	/* Reserve an extra map entry for use when replenishing the reserve. */
+	uma_zone_reserve(kmapentzone, KMAPENT_RESERVE + 1);
+	uma_prealloc(kmapentzone, KMAPENT_RESERVE + 1);
+	uma_zone_set_allocf(kmapentzone, kmapent_alloc);
+	uma_zone_set_freef(kmapentzone, kmapent_free);
+#endif
+
 	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
@@ -616,9 +693,15 @@ _vm_map_unlock(vm_map_t map, const char *file, int lin
 {
 
 	VM_MAP_UNLOCK_CONSISTENT(map);
-	if (map->system_map)
+	if (map->system_map) {
+#ifndef UMA_MD_SMALL_ALLOC
+		if (map == kernel_map && (map->flags & MAP_REPLENISH) != 0) {
+			uma_prealloc(kmapentzone, 1);
+			map->flags &= ~MAP_REPLENISH;
+		}
+#endif
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
-	else {
+	} else {
 		sx_xunlock_(&map->lock, file, line);
 		vm_map_process_deferred();
 	}
@@ -638,9 +721,11 @@ void
 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
 {
 
-	if (map->system_map)
+	if (map->system_map) {
+		KASSERT((map->flags & MAP_REPLENISH) == 0,
+		    ("%s: MAP_REPLENISH leaked", __func__));
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
-	else {
+	} else {
 		sx_sunlock_(&map->lock, file, line);
 		vm_map_process_deferred();
 	}
@@ -712,6 +797,8 @@ _vm_map_lock_downgrade(vm_map_t map, const char *file,
 {
 
 	if (map->system_map) {
+		KASSERT((map->flags & MAP_REPLENISH) == 0,
+		    ("%s: MAP_REPLENISH leaked", __func__));
 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
 	} else {
 		VM_MAP_UNLOCK_CONSISTENT(map);
@@ -755,10 +842,13 @@ _vm_map_unlock_and_wait(vm_map_t map, int timo, const 
 
 	VM_MAP_UNLOCK_CONSISTENT(map);
 	mtx_lock(&map_sleep_mtx);
-	if (map->system_map)
+	if (map->system_map) {
+		KASSERT((map->flags & MAP_REPLENISH) == 0,
+		    ("%s: MAP_REPLENISH leaked", __func__));
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
-	else
+	} else {
 		sx_xunlock_(&map->lock, file, line);
+	}
 	return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
 	    timo));
 }
@@ -881,12 +971,33 @@ vm_map_entry_create(vm_map_t map)
 {
 	vm_map_entry_t new_entry;
 
-	if (map->system_map)
+#ifndef UMA_MD_SMALL_ALLOC
+	if (map == kernel_map) {
+		VM_MAP_ASSERT_LOCKED(map);
+
+		/*
+		 * A new slab of kernel map entries cannot be allocated at this
+		 * point because the kernel map has not yet been updated to
+		 * reflect the caller's request.  Therefore, we allocate a new
+		 * map entry, dipping into the reserve if necessary, and set a
+		 * flag indicating that the reserve must be replenished before
+		 * the map is unlocked.
+		 */
+		new_entry = uma_zalloc(kmapentzone, M_NOWAIT | M_NOVM);
+		if (new_entry == NULL) {
+			new_entry = uma_zalloc(kmapentzone,
+			    M_NOWAIT | M_NOVM | M_USE_RESERVE);
+			kernel_map->flags |= MAP_REPLENISH;
+		}
+	} else
+#endif
+	if (map->system_map) {
 		new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
-	else
+	} else {
 		new_entry = uma_zalloc(mapentzone, M_WAITOK);
-	if (new_entry == NULL)
-		panic("vm_map_entry_create: kernel resources exhausted");
+	}
+	KASSERT(new_entry != NULL,
+	    ("vm_map_entry_create: kernel resources exhausted"));
 	return (new_entry);
 }
 
@@ -1771,6 +1882,8 @@ vm_map_findspace(vm_map_t map, vm_offset_t start, vm_s
 	vm_map_entry_t header, llist, rlist, root, y;
 	vm_size_t left_length, max_free_left, max_free_right;
 	vm_offset_t gap_end;
+
+	VM_MAP_ASSERT_LOCKED(map);
 
 	/*
 	 * Request must fit within min/max VM address and must avoid

Modified: head/sys/vm/vm_map.h
==============================================================================
--- head/sys/vm/vm_map.h	Wed Nov 11 15:53:36 2020	(r367594)
+++ head/sys/vm/vm_map.h	Wed Nov 11 17:16:39 2020	(r367595)
@@ -227,6 +227,7 @@ struct vm_map {
 #define	MAP_IS_SUB_MAP		0x04	/* has parent */
 #define	MAP_ASLR		0x08	/* enabled ASLR */
 #define	MAP_ASLR_IGNSTART	0x10
+#define	MAP_REPLENISH		0x20
 
 #ifdef	_KERNEL
 #if defined(KLD_MODULE) && !defined(KLD_TIED)

