git: 2619c5ccfe1f - main - Avoid waiting on physical allocations that can't possibly be satisfied

From: Jason A. Harmening <jah_at_FreeBSD.org>
Date: Sun, 24 Dec 2023 05:39:36 UTC
The branch main has been updated by jah:

URL: https://cgit.FreeBSD.org/src/commit/?id=2619c5ccfe1f7889f0241916bd17d06340142b05

commit 2619c5ccfe1f7889f0241916bd17d06340142b05
Author:     Jason A. Harmening <jah@FreeBSD.org>
AuthorDate: 2023-11-20 23:23:58 +0000
Commit:     Jason A. Harmening <jah@FreeBSD.org>
CommitDate: 2023-12-24 05:01:40 +0000

    Avoid waiting on physical allocations that can't possibly be satisfied
    
    - Change vm_page_reclaim_contig[_domain] to return an errno instead
      of a boolean.  0 indicates a successful reclaim, ENOMEM indicates
      lack of available memory to reclaim, with any other error (currently
      only ERANGE) indicating that reclamation is impossible for the
      specified address range.  Change all callers to only follow
      up with vm_wait* in the ENOMEM case.
    
    - Introduce vm_domainset_iter_ignore(), which marks the specified
      domain as unavailable for further use by the iterator.  Use this
      function to ignore domains that can't possibly satisfy a physical
      allocation request.  Since WAITOK allocations run the iterators
      repeatedly, this avoids the possibility of infinitely spinning
      in domain iteration if no available domain can satisfy the
      allocation request.
    
    PR:             274252
    Reported by:    kevans
    Tested by:      kevans
    Reviewed by:    markj
    Differential Revision: https://reviews.freebsd.org/D42706
---
 sys/arm/nvidia/drm2/tegra_bo.c              |  9 +++--
 sys/compat/linuxkpi/common/src/linux_page.c |  8 ++--
 sys/dev/drm2/ttm/ttm_bo.c                   |  4 +-
 sys/dev/drm2/ttm/ttm_page_alloc.c           |  9 +++--
 sys/kern/uipc_ktls.c                        |  5 ++-
 sys/kern/uipc_shm.c                         |  5 ++-
 sys/vm/vm_domainset.c                       | 32 +++++++++++++---
 sys/vm/vm_domainset.h                       |  2 +
 sys/vm/vm_kern.c                            | 24 +++++++++++-
 sys/vm/vm_page.c                            | 58 ++++++++++++++++++++++-------
 sys/vm/vm_page.h                            |  6 +--
 11 files changed, 123 insertions(+), 39 deletions(-)

diff --git a/sys/arm/nvidia/drm2/tegra_bo.c b/sys/arm/nvidia/drm2/tegra_bo.c
index 5b9eb8588f4e..1ffd65de9d36 100644
--- a/sys/arm/nvidia/drm2/tegra_bo.c
+++ b/sys/arm/nvidia/drm2/tegra_bo.c
@@ -94,7 +94,7 @@ tegra_bo_alloc_contig(size_t npages, u_long alignment, vm_memattr_t memattr,
     vm_page_t **ret_page)
 {
 	vm_page_t m;
-	int tries, i;
+	int err, i, tries;
 	vm_paddr_t low, high, boundary;
 
 	low = 0;
@@ -106,9 +106,12 @@ retry:
 	    low, high, alignment, boundary, memattr);
 	if (m == NULL) {
 		if (tries < 3) {
-			if (!vm_page_reclaim_contig(0, npages, low, high,
-			    alignment, boundary))
+			err = vm_page_reclaim_contig(0, npages, low, high,
+			    alignment, boundary);
+			if (err == ENOMEM)
 				vm_wait(NULL);
+			else if (err != 0)
+				return (ENOMEM);
 			tries++;
 			goto retry;
 		}
diff --git a/sys/compat/linuxkpi/common/src/linux_page.c b/sys/compat/linuxkpi/common/src/linux_page.c
index eb050b7250be..8b78a3739f25 100644
--- a/sys/compat/linuxkpi/common/src/linux_page.c
+++ b/sys/compat/linuxkpi/common/src/linux_page.c
@@ -118,10 +118,12 @@ linux_alloc_pages(gfp_t flags, unsigned int order)
 			    PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
 			if (page == NULL) {
 				if (flags & M_WAITOK) {
-					if (!vm_page_reclaim_contig(req,
-					    npages, 0, pmax, PAGE_SIZE, 0)) {
+					int err = vm_page_reclaim_contig(req,
+					    npages, 0, pmax, PAGE_SIZE, 0);
+					if (err == ENOMEM)
 						vm_wait(NULL);
-					}
+					else if (err != 0)
+						return (NULL);
 					flags &= ~M_WAITOK;
 					goto retry;
 				}
diff --git a/sys/dev/drm2/ttm/ttm_bo.c b/sys/dev/drm2/ttm/ttm_bo.c
index f60cb4f37535..b57d961338f4 100644
--- a/sys/dev/drm2/ttm/ttm_bo.c
+++ b/sys/dev/drm2/ttm/ttm_bo.c
@@ -1498,8 +1498,8 @@ retry:
 	    VM_MAX_ADDRESS, PAGE_SIZE, 0, VM_MEMATTR_UNCACHEABLE);
 
 	if (unlikely(glob->dummy_read_page == NULL)) {
-		if (tries < 1 && vm_page_reclaim_contig(0, 1, 0,
-		    VM_MAX_ADDRESS, PAGE_SIZE, 0)) {
+		if (tries < 1 && (vm_page_reclaim_contig(0, 1, 0,
+		    VM_MAX_ADDRESS, PAGE_SIZE, 0) == 0)) {
 			tries++;
 			goto retry;
 		}
diff --git a/sys/dev/drm2/ttm/ttm_page_alloc.c b/sys/dev/drm2/ttm/ttm_page_alloc.c
index 67c484218cc2..7518ecb4dfd1 100644
--- a/sys/dev/drm2/ttm/ttm_page_alloc.c
+++ b/sys/dev/drm2/ttm/ttm_page_alloc.c
@@ -158,16 +158,19 @@ static vm_page_t
 ttm_vm_page_alloc_dma32(int req, vm_memattr_t memattr)
 {
 	vm_page_t p;
-	int tries;
+	int err, tries;
 
 	for (tries = 0; ; tries++) {
 		p = vm_page_alloc_noobj_contig(req, 1, 0, 0xffffffff, PAGE_SIZE,
 		    0, memattr);
 		if (p != NULL || tries > 2)
 			return (p);
-		if (!vm_page_reclaim_contig(req, 1, 0, 0xffffffff,
-		    PAGE_SIZE, 0))
+		err = vm_page_reclaim_contig(req, 1, 0, 0xffffffff,
+		    PAGE_SIZE, 0);
+		if (err == ENOMEM)
 			vm_wait(NULL);
+		else if (err != 0)
+			return (NULL);
 	}
 }
 
diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index c996157dbc41..b69d16446b47 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -3160,8 +3160,9 @@ ktls_reclaim_thread(void *ctx)
 		 * backlogs of buffers to be encrypted, leading to
 		 * surges of traffic and potential NIC output drops.
 		 */
-		if (!vm_page_reclaim_contig_domain_ext(domain, VM_ALLOC_NORMAL,
-		    atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0, ktls_max_reclaim)) {
+		if (vm_page_reclaim_contig_domain_ext(domain, VM_ALLOC_NORMAL,
+		    atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0,
+		    ktls_max_reclaim) != 0) {
 			vm_wait_domain(domain);
 		} else {
 			sc->reclaims += ktls_max_reclaim;
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
index a8e2502808a0..f5803d1d72de 100644
--- a/sys/kern/uipc_shm.c
+++ b/sys/kern/uipc_shm.c
@@ -877,8 +877,9 @@ shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie)
 			}
 			error = vm_page_reclaim_contig(aflags,
 			    pagesizes[psind] / PAGE_SIZE, 0, ~0,
-			    pagesizes[psind], 0) ? 0 :
-			    vm_wait_intr(object);
+			    pagesizes[psind], 0);
+			if (error == ENOMEM)
+				error = vm_wait_intr(object);
 			if (error != 0) {
 				VM_OBJECT_WLOCK(object);
 				return (error);
diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c
index 9bc0df865154..b881466bffe5 100644
--- a/sys/vm/vm_domainset.c
+++ b/sys/vm/vm_domainset.c
@@ -68,6 +68,7 @@ vm_domainset_iter_init(struct vm_domainset_iter *di, struct domainset *ds,
 	di->di_domain = ds;
 	di->di_iter = iter;
 	di->di_policy = ds->ds_policy;
+	DOMAINSET_COPY(&ds->ds_mask, &di->di_valid_mask);
 	if (di->di_policy == DOMAINSET_POLICY_INTERLEAVE) {
 #if VM_NRESERVLEVEL > 0
 		if (vm_object_reserv(obj)) {
@@ -158,7 +159,7 @@ vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
 	switch (di->di_policy) {
 	case DOMAINSET_POLICY_FIRSTTOUCH:
 		*domain = PCPU_GET(domain);
-		if (DOMAINSET_ISSET(*domain, &di->di_domain->ds_mask)) {
+		if (DOMAINSET_ISSET(*domain, &di->di_valid_mask)) {
 			/*
 			 * Add an extra iteration because we will visit the
 			 * current domain a second time in the rr iterator.
@@ -221,11 +222,14 @@ int
 vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
     int *domain)
 {
+	if (__predict_false(DOMAINSET_EMPTY(&di->di_valid_mask)))
+		return (ENOMEM);
 
 	/* If there are more domains to visit we run the iterator. */
 	while (--di->di_n != 0) {
 		vm_domainset_iter_next(di, domain);
-		if (!di->di_minskip || !vm_page_count_min_domain(*domain))
+		if (DOMAINSET_ISSET(*domain, &di->di_valid_mask) &&
+		    (!di->di_minskip || !vm_page_count_min_domain(*domain)))
 			return (0);
 	}
 
@@ -243,7 +247,7 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
 	/* Wait for one of the domains to accumulate some free pages. */
 	if (obj != NULL)
 		VM_OBJECT_WUNLOCK(obj);
-	vm_wait_doms(&di->di_domain->ds_mask, 0);
+	vm_wait_doms(&di->di_valid_mask, 0);
 	if (obj != NULL)
 		VM_OBJECT_WLOCK(obj);
 	if ((di->di_flags & VM_ALLOC_WAITFAIL) != 0)
@@ -288,11 +292,14 @@ vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
 int
 vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
 {
+	if (DOMAINSET_EMPTY(&di->di_valid_mask))
+		return (ENOMEM);
 
 	/* If there are more domains to visit we run the iterator. */
 	while (--di->di_n != 0) {
 		vm_domainset_iter_next(di, domain);
-		if (!di->di_minskip || !vm_page_count_min_domain(*domain))
+		if (DOMAINSET_ISSET(*domain, &di->di_valid_mask) &&
+		    (!di->di_minskip || !vm_page_count_min_domain(*domain)))
 			return (0);
 	}
 
@@ -308,7 +315,7 @@ vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
 		return (ENOMEM);
 
 	/* Wait for one of the domains to accumulate some free pages. */
-	vm_wait_doms(&di->di_domain->ds_mask, 0);
+	vm_wait_doms(&di->di_valid_mask, 0);
 
 	/* Restart the search. */
 	vm_domainset_iter_first(di, domain);
@@ -316,6 +323,15 @@ vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
 	return (0);
 }
 
+void
+vm_domainset_iter_ignore(struct vm_domainset_iter *di, int domain)
+{
+	KASSERT(DOMAINSET_ISSET(domain, &di->di_valid_mask),
+	    ("%s: domain %d not present in di_valid_mask for di %p",
+	    __func__, domain, di));
+	DOMAINSET_CLR(domain, &di->di_valid_mask);
+}
+
 #else /* !NUMA */
 
 int
@@ -357,4 +373,10 @@ vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
 	*domain = 0;
 }
 
+void
+vm_domainset_iter_ignore(struct vm_domainset_iter *di __unused,
+    int domain __unused)
+{
+}
+
 #endif /* NUMA */
diff --git a/sys/vm/vm_domainset.h b/sys/vm/vm_domainset.h
index 4ce619069dd0..d2cfe362ae78 100644
--- a/sys/vm/vm_domainset.h
+++ b/sys/vm/vm_domainset.h
@@ -31,6 +31,7 @@
 struct vm_domainset_iter {
 	struct domainset	*di_domain;
 	unsigned int		*di_iter;
+	domainset_t		di_valid_mask;
 	vm_pindex_t		di_offset;
 	int			di_flags;
 	uint16_t		di_policy;
@@ -47,6 +48,7 @@ void	vm_domainset_iter_policy_init(struct vm_domainset_iter *,
 	    struct domainset *, int *, int *);
 void	vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *,
 	    struct domainset_ref *, int *, int *);
+void	vm_domainset_iter_ignore(struct vm_domainset_iter *, int);
 
 int	vm_wait_doms(const domainset_t *, int mflags);
 
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index b17b857bd902..1ef3154845b3 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -232,8 +232,8 @@ kmem_alloc_contig_pages(vm_object_t object, vm_pindex_t pindex, int domain,
 			break;
 
 		VM_OBJECT_WUNLOCK(object);
-		if (!vm_page_reclaim_contig_domain(domain, pflags, npages,
-		    low, high, alignment, boundary) && wait)
+		if (vm_page_reclaim_contig_domain(domain, pflags, npages,
+		    low, high, alignment, boundary) == ENOMEM && wait)
 			vm_wait_domain(domain);
 		VM_OBJECT_WLOCK(object);
 	}
@@ -306,8 +306,12 @@ kmem_alloc_attr_domainset(struct domainset *ds, vm_size_t size, int flags,
     vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr)
 {
 	struct vm_domainset_iter di;
+	vm_page_t bounds[2];
 	void *addr;
 	int domain;
+	int start_segind;
+
+	start_segind = -1;
 
 	vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
 	do {
@@ -315,6 +319,12 @@ kmem_alloc_attr_domainset(struct domainset *ds, vm_size_t size, int flags,
 		    memattr);
 		if (addr != NULL)
 			break;
+		if (start_segind == -1)
+			start_segind = vm_phys_lookup_segind(low);
+		if (vm_phys_find_range(bounds, start_segind, domain,
+		    atop(round_page(size)), low, high) == -1) {
+			vm_domainset_iter_ignore(&di, domain);
+		}
 	} while (vm_domainset_iter_policy(&di, &domain) == 0);
 
 	return (addr);
@@ -390,8 +400,12 @@ kmem_alloc_contig_domainset(struct domainset *ds, vm_size_t size, int flags,
     vm_memattr_t memattr)
 {
 	struct vm_domainset_iter di;
+	vm_page_t bounds[2];
 	void *addr;
 	int domain;
+	int start_segind;
+
+	start_segind = -1;
 
 	vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
 	do {
@@ -399,6 +413,12 @@ kmem_alloc_contig_domainset(struct domainset *ds, vm_size_t size, int flags,
 		    alignment, boundary, memattr);
 		if (addr != NULL)
 			break;
+		if (start_segind == -1)
+			start_segind = vm_phys_lookup_segind(low);
+		if (vm_phys_find_range(bounds, start_segind, domain,
+		    atop(round_page(size)), low, high) == -1) {
+			vm_domainset_iter_ignore(&di, domain);
+		}
 	} while (vm_domainset_iter_policy(&di, &domain) == 0);
 
 	return (addr);
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 745b689a3591..72057f2fc9f5 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -2170,8 +2170,12 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
     vm_paddr_t boundary, vm_memattr_t memattr)
 {
 	struct vm_domainset_iter di;
+	vm_page_t bounds[2];
 	vm_page_t m;
 	int domain;
+	int start_segind;
+
+	start_segind = -1;
 
 	vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
 	do {
@@ -2179,6 +2183,12 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
 		    npages, low, high, alignment, boundary, memattr);
 		if (m != NULL)
 			break;
+		if (start_segind == -1)
+			start_segind = vm_phys_lookup_segind(low);
+		if (vm_phys_find_range(bounds, start_segind, domain,
+		    npages, low, high) == -1) {
+			vm_domainset_iter_ignore(&di, domain);
+		}
 	} while (vm_domainset_iter_page(&di, object, &domain) == 0);
 
 	return (m);
@@ -3022,7 +3032,7 @@ unlock:
  *	"npages" must be greater than zero.  Both "alignment" and "boundary"
  *	must be a power of two.
  */
-bool
+int
 vm_page_reclaim_contig_domain_ext(int domain, int req, u_long npages,
     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
     int desired_runs)
@@ -3030,14 +3040,15 @@ vm_page_reclaim_contig_domain_ext(int domain, int req, u_long npages,
 	struct vm_domain *vmd;
 	vm_page_t bounds[2], m_run, _m_runs[NRUNS], *m_runs;
 	u_long count, minalign, reclaimed;
-	int error, i, min_reclaim, nruns, options, req_class, segind;
-	bool ret;
+	int error, i, min_reclaim, nruns, options, req_class;
+	int segind, start_segind;
+	int ret;
 
 	KASSERT(npages > 0, ("npages is 0"));
 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 
-	ret = false;
+	ret = ENOMEM;
 
 	/*
 	 * If the caller wants to reclaim multiple runs, try to allocate
@@ -3077,6 +3088,8 @@ vm_page_reclaim_contig_domain_ext(int domain, int req, u_long npages,
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
+	start_segind = vm_phys_lookup_segind(low);
+
 	/*
 	 * Return if the number of free pages cannot satisfy the requested
 	 * allocation.
@@ -3093,14 +3106,17 @@ vm_page_reclaim_contig_domain_ext(int domain, int req, u_long npages,
 	 * the reclamation of reservations and superpages each time.
 	 */
 	for (options = VPSC_NORESERV;;) {
+		bool phys_range_exists = false;
+
 		/*
 		 * Find the highest runs that satisfy the given constraints
 		 * and restrictions, and record them in "m_runs".
 		 */
 		count = 0;
-		segind = vm_phys_lookup_segind(low);
+		segind = start_segind;
 		while ((segind = vm_phys_find_range(bounds, segind, domain,
 		    npages, low, high)) != -1) {
+			phys_range_exists = true;
 			while ((m_run = vm_page_scan_contig(npages, bounds[0],
 			    bounds[1], alignment, boundary, options))) {
 				bounds[0] = m_run + npages;
@@ -3110,6 +3126,11 @@ vm_page_reclaim_contig_domain_ext(int domain, int req, u_long npages,
 			segind++;
 		}
 
+		if (!phys_range_exists) {
+			ret = ERANGE;
+			goto done;
+		}
+
 		/*
 		 * Reclaim the highest runs in LIFO (descending) order until
 		 * the number of reclaimed pages, "reclaimed", is at least
@@ -3126,7 +3147,7 @@ vm_page_reclaim_contig_domain_ext(int domain, int req, u_long npages,
 			if (error == 0) {
 				reclaimed += npages;
 				if (reclaimed >= min_reclaim) {
-					ret = true;
+					ret = 0;
 					goto done;
 				}
 			}
@@ -3141,7 +3162,8 @@ vm_page_reclaim_contig_domain_ext(int domain, int req, u_long npages,
 		else if (options == VPSC_NOSUPER)
 			options = VPSC_ANY;
 		else if (options == VPSC_ANY) {
-			ret = reclaimed != 0;
+			if (reclaimed != 0)
+				ret = 0;
 			goto done;
 		}
 	}
@@ -3151,7 +3173,7 @@ done:
 	return (ret);
 }
 
-bool
+int
 vm_page_reclaim_contig_domain(int domain, int req, u_long npages,
     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 {
@@ -3159,20 +3181,28 @@ vm_page_reclaim_contig_domain(int domain, int req, u_long npages,
 	    alignment, boundary, 1));
 }
 
-bool
+int
 vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary)
 {
 	struct vm_domainset_iter di;
-	int domain;
-	bool ret;
+	int domain, ret, status;
+
+	ret = ERANGE;
 
 	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
 	do {
-		ret = vm_page_reclaim_contig_domain(domain, req, npages, low,
+		status = vm_page_reclaim_contig_domain(domain, req, npages, low,
 		    high, alignment, boundary);
-		if (ret)
-			break;
+		if (status == 0)
+			return (0);
+		else if (status == ERANGE)
+			vm_domainset_iter_ignore(&di, domain);
+		else {
+			KASSERT(status == ENOMEM, ("Unrecognized error %d "
+			    "from vm_page_reclaim_contig_domain()", status));
+			ret = ENOMEM;
+		}
 	} while (vm_domainset_iter_page(&di, NULL, &domain) == 0);
 
 	return (ret);
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 735c358320ff..e6bff3334d39 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -660,11 +660,11 @@ vm_page_t vm_page_prev(vm_page_t m);
 bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m);
 void vm_page_putfake(vm_page_t m);
 void vm_page_readahead_finish(vm_page_t m);
-bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low,
+int vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low,
     vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
-bool vm_page_reclaim_contig_domain(int domain, int req, u_long npages,
+int vm_page_reclaim_contig_domain(int domain, int req, u_long npages,
     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
-bool vm_page_reclaim_contig_domain_ext(int domain, int req, u_long npages,
+int vm_page_reclaim_contig_domain_ext(int domain, int req, u_long npages,
     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
     int desired_runs);
 void vm_page_reference(vm_page_t m);