git: 2c2bb7913d47 - stable/14 - vm_domainset: Refactor iterators, multiple fixes
Date: Fri, 19 Sep 2025 11:42:40 UTC
The branch stable/14 has been updated by olce:

URL: https://cgit.FreeBSD.org/src/commit/?id=2c2bb7913d478889da548fe9837dc9b32727a81d

commit 2c2bb7913d478889da548fe9837dc9b32727a81d
Author:     Olivier Certner <olce@FreeBSD.org>
AuthorDate: 2025-07-08 12:28:31 +0000
Commit:     Olivier Certner <olce@FreeBSD.org>
CommitDate: 2025-09-19 11:41:49 +0000

    vm_domainset: Refactor iterators, multiple fixes

    vm_domainset_iter_first() would not check whether the initial domain
    selected by the policy was effectively valid (i.e., allowed by the
    domainset and not marked as ignored by vm_domainset_iter_ignore()).  It
    would just try to skip it if it had fewer pages than 'free_min', and
    would not take into account the possibility of no domains being valid.

    Factor out code that logically belongs to the iterator machinery and is
    not tied to how allocations (or the impossibility thereof) are to be
    handled.  This allows removing duplicated code between
    vm_domainset_iter_page() and vm_domainset_iter_policy(), and between
    vm_domainset_iter_page_init() and _vm_domainset_iter_policy_init().  It
    also allows removing the 'pages' parameter from
    vm_domainset_iter_page_init().

    This also makes the two-phase logic clearer, revealing an inconsistency
    between setting 'di_minskip' to true in vm_domainset_iter_init()
    (implying that, in the case of waiting allocations, further attempts
    after the first sleep should just allocate from the first domain,
    regardless of its situation with respect to 'free_min') and trying to
    skip the first domain if it has too few pages in
    vm_domainset_iter_page_init() and _vm_domainset_iter_policy_init().
    Fix this inconsistency by resetting 'di_minskip' to 'true' in
    vm_domainset_iter_first() instead, so that after each vm_wait_doms()
    (for waiting allocations that could not be satisfied immediately) we
    again start with only the domains that have more than 'free_min' pages.

    While here, fix the minor quirk that the round-robin policy would start
    with the domain after the one pointed to by the initial value of
    'di_iter' (this just affects the case of resetting '*di_iter', and
    would not cause domain skips in other circumstances, i.e., for waiting
    allocations that actually wait or at each subsequent new iterator
    creation with the same iteration index storage).

    PR:             277476
    Tested by:      Kenneth Raplee <kenrap_kennethraplee.com>
    Fixes:          7b11a4832691 ("Add files for r327895")
    Fixes:          e5818a53dbd2 ("Implement several enhancements to NUMA policies.")
    Fixes:          23984ce5cd24 ("Avoid resource deadlocks when one domain has exhausted its memory."...)
    MFC after:      10 days
    Sponsored by:   The FreeBSD Foundation
    Differential Revision:  https://reviews.freebsd.org/D51251

    (cherry picked from commit 637d9858e6a8b4a8a3ee4dd80743a58bde4cbd68)
---
 sys/kern/kern_malloc.c |  11 ++-
 sys/vm/uma_core.c      |  10 ++-
 sys/vm/vm_domainset.c  | 234 +++++++++++++++++++++++++++++--------------------
 sys/vm/vm_domainset.h  |   6 +-
 sys/vm/vm_kern.c       |  12 ++-
 sys/vm/vm_page.c       |  20 +++--
 6 files changed, 177 insertions(+), 116 deletions(-)

diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
index 94583288430b..5f55ca75f1ed 100644
--- a/sys/kern/kern_malloc.c
+++ b/sys/kern/kern_malloc.c
@@ -771,11 +771,14 @@ malloc_domainset(size_t size, struct malloc_type *mtp, struct domainset *ds,
 		return (malloc_large(size, mtp, DOMAINSET_RR(), flags
 		    DEBUG_REDZONE_ARG));

-	vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
-	do {
-		va = malloc_domain(&size, &indx, mtp, domain, flags);
-	} while (va == NULL && vm_domainset_iter_policy(&di, &domain) == 0);
+	indx = -1;
+	va = NULL;
+	if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) == 0)
+		do {
+			va = malloc_domain(&size, &indx, mtp, domain, flags);
+		} while (va == NULL && vm_domainset_iter_policy(&di, &domain) == 0);
 	malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx);
+
 	if (__predict_false(va == NULL)) {
 		KASSERT((flags & M_WAITOK) == 0,
 		    ("malloc(M_WAITOK) returned NULL"));
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index 56b7cc601754..dbac8cd3db27 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -3984,8 +3984,9 @@ restart:
 	rr = rdomain == UMA_ANYDOMAIN;
 	if (rr) {
 		aflags = (flags & ~M_WAITOK) | M_NOWAIT;
-		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
-		    &aflags);
+		if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
+		    &aflags) != 0)
+			return (NULL);
 	} else {
 		aflags = flags;
 		domain = rdomain;
@@ -5212,8 +5213,9 @@ uma_prealloc(uma_zone_t zone, int items)
 	slabs = howmany(items, keg->uk_ipers);
 	while (slabs-- > 0) {
 		aflags = M_NOWAIT;
-		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
-		    &aflags);
+		if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
+		    &aflags) != 0)
+			panic("%s: Domainset is empty", __func__);
 		for (;;) {
 			slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
 			    aflags);
diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c
index 27035b69c669..e10cf2540919 100644
--- a/sys/vm/vm_domainset.c
+++ b/sys/vm/vm_domainset.c
@@ -57,6 +57,9 @@

 static int vm_domainset_default_stride = 64;

+static bool vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain);
+
+
 /*
  * Determine which policy is to be used for this allocation.
  */
@@ -89,8 +92,6 @@ vm_domainset_iter_init(struct vm_domainset_iter *di, struct domainset *ds,
 		pindex += (((uintptr_t)obj) / sizeof(*obj));
 		di->di_offset = pindex;
 	}
-	/* Skip domains below min on the first pass. */
-	di->di_minskip = true;
 }

 static void
@@ -99,7 +100,7 @@ vm_domainset_iter_rr(struct vm_domainset_iter *di, int *domain)
 {
 	/* Grab the next domain in 'ds_order'. */
 	*domain = di->di_domain->ds_order[
-	    ++(*di->di_iter) % di->di_domain->ds_cnt];
+	    (*di->di_iter)++ % di->di_domain->ds_cnt];
 }

 static void
@@ -112,31 +113,21 @@ vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain)
 	*domain = di->di_domain->ds_order[d];
 }

-static void
-vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
-{
-
-	KASSERT(!DOMAINSET_EMPTY(&di->di_remain_mask),
-	    ("%s: Already iterated on all domains", __func__));
-	vm_domainset_iter_rr(di, domain);
-	KASSERT(*domain < vm_ndomains,
-	    ("%s: Invalid domain %d", __func__, *domain));
-}
-
-static void
-vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
+/*
+ * Internal function determining the current phase's first candidate domain.
+ *
+ * Returns whether these is an eligible domain, which is returned through
+ * '*domain'.  '*domain' can be modified even if there is no eligible domain.
+ *
+ * See herald comment of vm_domainset_iter_first() below about phases.
+ */
+static bool
+vm_domainset_iter_phase_first(struct vm_domainset_iter *di, int *domain)
 {
-
 	switch (di->di_policy) {
 	case DOMAINSET_POLICY_FIRSTTOUCH:
 		*domain = PCPU_GET(domain);
-		if (DOMAINSET_ISSET(*domain, &di->di_valid_mask))
-			break;
-		/*
-		 * To prevent impossible allocations we convert an invalid
-		 * first-touch to round-robin.
-		 */
-		/* FALLTHROUGH */
+		break;
 	case DOMAINSET_POLICY_ROUNDROBIN:
 		vm_domainset_iter_rr(di, domain);
 		break;
@@ -152,25 +143,114 @@ vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
 	KASSERT(*domain < vm_ndomains,
 	    ("%s: Invalid domain %d", __func__, *domain));

+	/*
+	 * Has the policy's start domain already been visited?
+	 */
+	if (!DOMAINSET_ISSET(*domain, &di->di_remain_mask))
+		return (vm_domainset_iter_next(di, domain));
+
+	DOMAINSET_CLR(*domain, &di->di_remain_mask);
+
+	/* Does it have enough free pages (phase 1)? */
+	if (di->di_minskip && vm_page_count_min_domain(*domain)) {
+		/* Mark the domain as eligible for phase 2. */
+		DOMAINSET_SET(*domain, &di->di_min_mask);
+		return (vm_domainset_iter_next(di, domain));
+	}
+
+	return (true);
+}
+
+/*
+ * Resets an iterator to point to the first candidate domain.
+ *
+ * Returns whether there is an eligible domain to start with.  '*domain' may be
+ * modified even if there is none.
+ *
+ * There must have been one call to vm_domainset_iter_init() before.
+ *
+ * This function must be called at least once before calling
+ * vm_domainset_iter_next().  Note that functions wrapping
+ * vm_domainset_iter_init() usually do that themselves.
+ *
+ * This function may be called again to reset the iterator to the policy's first
+ * candidate domain.  After each reset, the iterator will visit the same domains
+ * as in the previous iteration minus those on which vm_domainset_iter_ignore()
+ * has been called.  Note that the first candidate domain may change at each
+ * reset (at time of this writing, only on the DOMAINSET_POLICY_ROUNDROBIN
+ * policy).
+ *
+ * Domains which have a number of free pages over 'v_free_min' are always
+ * visited first (this is called the "phase 1" in comments, "phase 2" being the
+ * examination of the remaining domains; no domains are ever visited twice).
+ */
+static bool
+vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
+{
 	/* Initialize the mask of domains to visit. */
+	DOMAINSET_COPY(&di->di_valid_mask, &di->di_remain_mask);
+	/*
+	 * No candidate domains for phase 2 at start.  This will be filled by
+	 * phase 1.
+	 */
+	DOMAINSET_ZERO(&di->di_min_mask);
+	/* Skip domains below 'v_free_min' on phase 1. */
+	di->di_minskip = true;
+
+	return (vm_domainset_iter_phase_first(di, domain));
+}
+
+/*
+ * Advances the iterator to the next candidate domain.
+ *
+ * Returns whether there was another domain to visit.  '*domain' may be modified
+ * even if there is none.
+ *
+ * vm_domainset_iter_first() must have been called at least once before using
+ * this function (see its herald comment for more details on iterators).
+ */
+static bool
+vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
+{
+	/* Loop while there remains domains to visit in the current phase. */
+	while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {
+		/* Grab the next domain in 'ds_order'. */
+		vm_domainset_iter_rr(di, domain);
+		KASSERT(*domain < vm_ndomains,
+		    ("%s: Invalid domain %d", __func__, *domain));
+
+		if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {
+			DOMAINSET_CLR(*domain, &di->di_remain_mask);
+			if (!di->di_minskip || !vm_page_count_min_domain(*domain))
+				return (true);
+			DOMAINSET_SET(*domain, &di->di_min_mask);
+		}
+	}
+
+	/*
+	 * If phase 1 (skip low memory domains) is over, start phase 2 (consider
+	 * low memory domains).
+	 */
 	if (di->di_minskip) {
-		/* Phase 1: Skip domains under 'v_free_min'. */
-		DOMAINSET_COPY(&di->di_valid_mask, &di->di_remain_mask);
-		DOMAINSET_ZERO(&di->di_min_mask);
-	} else
-		/* Phase 2: Browse domains that were under 'v_free_min'. */
+		di->di_minskip = false;
+		/* Browse domains that were under 'v_free_min'. */
 		DOMAINSET_COPY(&di->di_min_mask, &di->di_remain_mask);
+		return (vm_domainset_iter_phase_first(di, domain));
+	}

-	/* Mark first domain as seen. */
-	DOMAINSET_CLR(*domain, &di->di_remain_mask);
+	return (false);
 }

-void
+int
 vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
     vm_pindex_t pindex, int *domain, int *req)
 {
 	struct domainset_ref *dr;

+	di->di_flags = *req;
+	*req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
+	    VM_ALLOC_NOWAIT;
+
 	/*
 	 * Object policy takes precedence over thread policy.  The policies
 	 * are immutable and unsynchronized.  Updates can race but pointer
@@ -180,39 +260,21 @@ vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
 		dr = &obj->domain;
 	else
 		dr = &curthread->td_domain;
+
 	vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, obj, pindex);
-	di->di_flags = *req;
-	*req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
-	    VM_ALLOC_NOWAIT;
-	vm_domainset_iter_first(di, domain);
-	if (vm_page_count_min_domain(*domain))
-		vm_domainset_iter_page(di, obj, domain);
+	/*
+	 * XXXOC: Shouldn't we just panic on 'false' if VM_ALLOC_WAITOK was
+	 * passed?
+	 */
+	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
 }

 int
 vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
     int *domain)
 {
-	if (__predict_false(DOMAINSET_EMPTY(&di->di_valid_mask)))
-		return (ENOMEM);
-
-	/* If there are more domains to visit in this phase, run the iterator. */
-	while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {
-		vm_domainset_iter_next(di, domain);
-		if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {
-			DOMAINSET_CLR(*domain, &di->di_remain_mask);
-			if (!di->di_minskip || !vm_page_count_min_domain(*domain))
-				return (0);
-			DOMAINSET_SET(*domain, &di->di_min_mask);
-		}
-	}
-
-	/* If we skipped domains below min restart the search. */
-	if (di->di_minskip) {
-		di->di_minskip = false;
-		vm_domainset_iter_first(di, domain);
+	if (vm_domainset_iter_next(di, domain))
 		return (0);
-	}

 	/* If we visited all domains and this was a NOWAIT we return error. */
 	if ((di->di_flags & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) == 0)
@@ -228,64 +290,43 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
 		return (ENOMEM);

 	/* Restart the search. */
-	vm_domainset_iter_first(di, domain);
-
-	return (0);
+	/* XXXOC: Shouldn't we just panic on 'false'? */
+	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
 }

-static void
+static int
 _vm_domainset_iter_policy_init(struct vm_domainset_iter *di, int *domain,
     int *flags)
 {
-
 	di->di_flags = *flags;
 	*flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT;
-	vm_domainset_iter_first(di, domain);
-	if (vm_page_count_min_domain(*domain))
-		vm_domainset_iter_policy(di, domain);
+	/* XXXOC: Shouldn't we just panic on 'false' if M_WAITOK was passed? */
+	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
 }

-void
+int
 vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
     struct domainset *ds, int *domain, int *flags)
 {
 	vm_domainset_iter_init(di, ds, &curthread->td_domain.dr_iter, NULL, 0);
-	_vm_domainset_iter_policy_init(di, domain, flags);
+	return (_vm_domainset_iter_policy_init(di, domain, flags));
 }

-void
+int
 vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
     struct domainset_ref *dr, int *domain, int *flags)
 {
 	vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, NULL, 0);
-	_vm_domainset_iter_policy_init(di, domain, flags);
+	return (_vm_domainset_iter_policy_init(di, domain, flags));
 }

 int
 vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
 {
-	if (DOMAINSET_EMPTY(&di->di_valid_mask))
-		return (ENOMEM);
-
-	/* If there are more domains to visit in this phase, run the iterator. */
-	while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {
-		vm_domainset_iter_next(di, domain);
-		if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {
-			DOMAINSET_CLR(*domain, &di->di_remain_mask);
-			if (!di->di_minskip || !vm_page_count_min_domain(*domain))
-				return (0);
-			DOMAINSET_SET(*domain, &di->di_min_mask);
-		}
-	}
-
-	/* If we skipped domains below min restart the search. */
-	if (di->di_minskip) {
-		di->di_minskip = false;
-		vm_domainset_iter_first(di, domain);
+	if (vm_domainset_iter_next(di, domain))
 		return (0);
-	}

 	/* If we visited all domains and this was a NOWAIT we return error. */
 	if ((di->di_flags & M_WAITOK) == 0)
@@ -295,9 +336,8 @@ vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
 	vm_wait_doms(&di->di_valid_mask, 0);

 	/* Restart the search. */
-	vm_domainset_iter_first(di, domain);
-
-	return (0);
+	/* XXXOC: Shouldn't we just panic on 'false'? */
+	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
 }

 void
@@ -319,12 +359,12 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
 	return (EJUSTRETURN);
 }

-void
+int
 vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
     vm_pindex_t pindex, int *domain, int *flags)
 {
-	*domain = 0;
+	return (0);
 }

 int
@@ -334,20 +374,20 @@ vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
 	return (EJUSTRETURN);
 }

-void
+int
 vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
     struct domainset *ds, int *domain, int *flags)
 {
-	*domain = 0;
+	return (0);
 }

-void
+int
 vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
     struct domainset_ref *dr, int *domain, int *flags)
 {
-	*domain = 0;
+	return (0);
 }

 void
diff --git a/sys/vm/vm_domainset.h b/sys/vm/vm_domainset.h
index 35fd1679e1c8..c729467fe08c 100644
--- a/sys/vm/vm_domainset.h
+++ b/sys/vm/vm_domainset.h
@@ -45,12 +45,12 @@ struct vm_domainset_iter {

 int	vm_domainset_iter_page(struct vm_domainset_iter *, struct vm_object *,
 	    int *);
-void	vm_domainset_iter_page_init(struct vm_domainset_iter *,
+int	vm_domainset_iter_page_init(struct vm_domainset_iter *,
 	    struct vm_object *, vm_pindex_t, int *, int *);
 int	vm_domainset_iter_policy(struct vm_domainset_iter *, int *);
-void	vm_domainset_iter_policy_init(struct vm_domainset_iter *,
+int	vm_domainset_iter_policy_init(struct vm_domainset_iter *,
 	    struct domainset *, int *, int *);
-void	vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *,
+int	vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *,
 	    struct domainset_ref *, int *, int *);
 void	vm_domainset_iter_ignore(struct vm_domainset_iter *, int);
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index f11dfd745720..b6e4112ed6c1 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -315,7 +315,9 @@ kmem_alloc_attr_domainset(struct domainset *ds, vm_size_t size, int flags,

 	start_segind = -1;

-	vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
+	if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0)
+		return (NULL);
+
 	do {
 		addr = kmem_alloc_attr_domain(domain, size, flags, low, high,
 		    memattr);
@@ -409,7 +411,9 @@ kmem_alloc_contig_domainset(struct domainset *ds, vm_size_t size, int flags,

 	start_segind = -1;

-	vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
+	if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags))
+		return (NULL);
+
 	do {
 		addr = kmem_alloc_contig_domain(domain, size, flags, low, high,
 		    alignment, boundary, memattr);
@@ -507,7 +511,9 @@ kmem_malloc_domainset(struct domainset *ds, vm_size_t size, int flags)
 	void *addr;
 	int domain;

-	vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
+	if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0)
+		return (NULL);
+
 	do {
 		addr = kmem_malloc_domain(domain, size, flags);
 		if (addr != NULL)
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 6daaaa79163e..b23c44ae083d 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -1940,7 +1940,9 @@ vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex,
 	vm_page_t m;
 	int domain;

-	vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
+	if (vm_domainset_iter_page_init(&di, object, pindex, &domain, &req) != 0)
+		return (NULL);
+
 	do {
 		m = vm_page_alloc_domain_after(object, pindex, domain, req,
 		    mpred);
@@ -2178,7 +2180,9 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,

 	start_segind = -1;

-	vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
+	if (vm_domainset_iter_page_init(&di, object, pindex, &domain, &req) != 0)
+		return (NULL);
+
 	do {
 		m = vm_page_alloc_contig_domain(object, pindex, domain, req,
 		    npages, low, high, alignment, boundary, memattr);
@@ -2448,7 +2452,9 @@ vm_page_alloc_noobj(int req)
 	vm_page_t m;
 	int domain;

-	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
+	if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
+		return (NULL);
+
 	do {
 		m = vm_page_alloc_noobj_domain(domain, req);
 		if (m != NULL)
@@ -2473,7 +2479,9 @@ vm_page_alloc_noobj_contig(int req, u_long npages, vm_paddr_t low,
 	vm_page_t m;
 	int domain;

-	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
+	if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
+		return (NULL);
+
 	do {
 		m = vm_page_alloc_noobj_contig_domain(domain, req, npages, low,
 		    high, alignment, boundary, memattr);
@@ -3191,7 +3199,9 @@ vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,

 	ret = ERANGE;

-	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
+	if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
+		return (ret);
+
 	do {
 		status = vm_page_reclaim_contig_domain(domain, req, npages,
 		    low, high, alignment, boundary);
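
[Editorial note, not part of the commit: the caller-visible change repeated across all of the hunks above is that the iterator init functions now return an int, and every consumer must check that return value before entering its allocation loop. The following is a minimal sketch of the resulting caller pattern, condensed from the kmem_malloc_domainset() hunk; the function name alloc_from_domainset() is illustrative only and not part of the patch, and error handling beyond the init check is omitted.]

    /*
     * Sketch of the post-commit caller pattern (illustrative only).
     */
    static void *
    alloc_from_domainset(struct domainset *ds, vm_size_t size, int flags)
    {
            struct vm_domainset_iter di;
            void *addr;
            int domain;

            /* The init functions can now fail, e.g. when no domain is eligible. */
            if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0)
                    return (NULL);
            do {
                    addr = kmem_malloc_domain(domain, size, flags);
                    if (addr != NULL)
                            break;
                    /* Advance to the next candidate domain (phase 1, then phase 2). */
            } while (vm_domainset_iter_policy(&di, &domain) == 0);
            return (addr);
    }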