git: 83ad6d8d8eee - stable/15 - vm_domainset: Refactor iterators, multiple fixes
Date: Fri, 19 Sep 2025 10:07:18 UTC
The branch stable/15 has been updated by olce:

URL: https://cgit.FreeBSD.org/src/commit/?id=83ad6d8d8eee49223679ce9e32860dcd0ea26434

commit 83ad6d8d8eee49223679ce9e32860dcd0ea26434
Author:     Olivier Certner <olce@FreeBSD.org>
AuthorDate: 2025-07-08 12:28:31 +0000
Commit:     Olivier Certner <olce@FreeBSD.org>
CommitDate: 2025-09-19 10:07:04 +0000

    vm_domainset: Refactor iterators, multiple fixes

    vm_domainset_iter_first() would not check whether the initial domain
    selected by the policy was actually valid (i.e., allowed by the
    domainset and not marked as ignored by vm_domainset_iter_ignore()).
    It would just try to skip it if it had fewer pages than 'free_min',
    and would not take into account the possibility that no domain is
    valid.

    Factor out code that logically belongs to the iterator machinery and
    is not tied to how allocations (or the impossibility thereof) are to
    be handled.  This allows removing duplicated code between
    vm_domainset_iter_page() and vm_domainset_iter_policy(), and between
    vm_domainset_iter_page_init() and _vm_domainset_iter_policy_init().
    It also allows removing the 'pages' parameter from
    vm_domainset_iter_page_init().

    This also makes the two-phase logic clearer, revealing an
    inconsistency between setting 'di_minskip' to true in
    vm_domainset_iter_init() (implying that, in the case of waiting
    allocations, further attempts after the first sleep should just
    allocate from the first domain, regardless of their situation with
    respect to 'free_min') and trying to skip the first domain if it has
    too few pages in vm_domainset_iter_page_init() and
    _vm_domainset_iter_policy_init().  Fix this inconsistency by
    resetting 'di_minskip' to 'true' in vm_domainset_iter_first() instead
    so that, after each vm_wait_doms() (waiting allocations that could
    not be satisfied immediately), we again start with only the domains
    that have more than 'free_min' pages.

    While here, fix the minor quirk that the round-robin policy would
    start with the domain after the one pointed to by the initial value
    of 'di_iter' (this just affects the case of resetting '*di_iter', and
    would not cause domain skips in other circumstances, i.e., for
    waiting allocations that actually wait or at each subsequent new
    iterator creation with the same iteration index storage).

    PR:             277476
    Tested by:      Kenneth Raplee <kenrap_kennethraplee.com>
    Fixes:          7b11a4832691 ("Add files for r327895")
    Fixes:          e5818a53dbd2 ("Implement several enhancements to NUMA policies.")
    Fixes:          23984ce5cd24 ("Avoid resource deadlocks when one domain has exhausted its memory."...)
    MFC after:      10 days
    Sponsored by:   The FreeBSD Foundation
    Differential Revision:  https://reviews.freebsd.org/D51251

    (cherry picked from commit 637d9858e6a8b4a8a3ee4dd80743a58bde4cbd68)
---
 sys/kern/kern_malloc.c |  11 ++-
 sys/vm/uma_core.c      |  10 ++-
 sys/vm/vm_domainset.c  | 238 +++++++++++++++++++++++++++++--------------------
 sys/vm/vm_domainset.h  |   9 +-
 sys/vm/vm_glue.c       |   2 +-
 sys/vm/vm_kern.c       |  12 ++-
 sys/vm/vm_page.c       |  21 +++--
 7 files changed, 181 insertions(+), 122 deletions(-)

diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
index 879220be050b..653ce1ee556b 100644
--- a/sys/kern/kern_malloc.c
+++ b/sys/kern/kern_malloc.c
@@ -751,11 +751,14 @@ malloc_domainset(size_t size, struct malloc_type *mtp, struct domainset *ds,
 		return (malloc_large(size, mtp, DOMAINSET_RR(), flags
 		    DEBUG_REDZONE_ARG));
 
-	vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
-	do {
-		va = malloc_domain(&size, &indx, mtp, domain, flags);
-	} while (va == NULL && vm_domainset_iter_policy(&di, &domain) == 0);
+	indx = -1;
+	va = NULL;
+	if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) == 0)
+		do {
+			va = malloc_domain(&size, &indx, mtp, domain, flags);
+		} while (va == NULL && vm_domainset_iter_policy(&di, &domain) == 0);
 	malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx);
+
 	if (__predict_false(va == NULL)) {
 		KASSERT((flags & M_WAITOK) == 0,
 		    ("malloc(M_WAITOK) returned NULL"));
diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index 5189f7405400..679b2e20e88b 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -4017,8 +4017,9 @@ restart:
 	rr = rdomain == UMA_ANYDOMAIN;
 	if (rr) {
 		aflags = (flags & ~M_WAITOK) | M_NOWAIT;
-		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
-		    &aflags);
+		if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
+		    &aflags) != 0)
+			return (NULL);
 	} else {
 		aflags = flags;
 		domain = rdomain;
@@ -5245,8 +5246,9 @@ uma_prealloc(uma_zone_t zone, int items)
 	slabs = howmany(items, keg->uk_ipers);
 	while (slabs-- > 0) {
 		aflags = M_NOWAIT;
-		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
-		    &aflags);
+		if (vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
+		    &aflags) != 0)
+			panic("%s: Domainset is empty", __func__);
 		for (;;) {
 			slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
 			    aflags);
diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c
index a46dbe5ae92e..9fa17da954f7 100644
--- a/sys/vm/vm_domainset.c
+++ b/sys/vm/vm_domainset.c
@@ -58,6 +58,9 @@
 
 static int vm_domainset_default_stride = 64;
 
+static bool vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain);
+
+
 /*
  * Determine which policy is to be used for this allocation.
 */
@@ -93,8 +96,6 @@ vm_domainset_iter_init(struct vm_domainset_iter *di, struct domainset *ds,
 		pindex += (((uintptr_t)obj) / sizeof(*obj));
 		di->di_offset = pindex;
 	}
-	/* Skip domains below min on the first pass. */
-	di->di_minskip = true;
 }
 
 static void
@@ -103,7 +104,7 @@ vm_domainset_iter_rr(struct vm_domainset_iter *di, int *domain)
 {
 	/* Grab the next domain in 'ds_order'. */
 	*domain = di->di_domain->ds_order[
-	    ++(*di->di_iter) % di->di_domain->ds_cnt];
+	    (*di->di_iter)++ % di->di_domain->ds_cnt];
 }
 
 static void
@@ -116,31 +117,21 @@ vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain)
 	*domain = di->di_domain->ds_order[d];
 }
 
-static void
-vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
-{
-
-	KASSERT(!DOMAINSET_EMPTY(&di->di_remain_mask),
-	    ("%s: Already iterated on all domains", __func__));
-	vm_domainset_iter_rr(di, domain);
-	KASSERT(*domain < vm_ndomains,
-	    ("%s: Invalid domain %d", __func__, *domain));
-}
-
-static void
-vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
+/*
+ * Internal function determining the current phase's first candidate domain.
+ *
+ * Returns whether these is an eligible domain, which is returned through
+ * '*domain'.  '*domain' can be modified even if there is no eligible domain.
+ *
+ * See herald comment of vm_domainset_iter_first() below about phases.
+ */
+static bool
+vm_domainset_iter_phase_first(struct vm_domainset_iter *di, int *domain)
 {
-
 	switch (di->di_policy) {
 	case DOMAINSET_POLICY_FIRSTTOUCH:
 		*domain = PCPU_GET(domain);
-		if (DOMAINSET_ISSET(*domain, &di->di_valid_mask))
-			break;
-		/*
-		 * To prevent impossible allocations we convert an invalid
-		 * first-touch to round-robin.
-		 */
-		/* FALLTHROUGH */
+		break;
 	case DOMAINSET_POLICY_ROUNDROBIN:
 		vm_domainset_iter_rr(di, domain);
 		break;
@@ -156,25 +147,114 @@
 	KASSERT(*domain < vm_ndomains,
 	    ("%s: Invalid domain %d", __func__, *domain));
 
+	/*
+	 * Has the policy's start domain already been visited?
+	 */
+	if (!DOMAINSET_ISSET(*domain, &di->di_remain_mask))
+		return (vm_domainset_iter_next(di, domain));
+
+	DOMAINSET_CLR(*domain, &di->di_remain_mask);
+
+	/* Does it have enough free pages (phase 1)? */
+	if (di->di_minskip && vm_page_count_min_domain(*domain)) {
+		/* Mark the domain as eligible for phase 2. */
+		DOMAINSET_SET(*domain, &di->di_min_mask);
+		return (vm_domainset_iter_next(di, domain));
+	}
+
+	return (true);
+}
+
+/*
+ * Resets an iterator to point to the first candidate domain.
+ *
+ * Returns whether there is an eligible domain to start with.  '*domain' may be
+ * modified even if there is none.
+ *
+ * There must have been one call to vm_domainset_iter_init() before.
+ *
+ * This function must be called at least once before calling
+ * vm_domainset_iter_next().  Note that functions wrapping
+ * vm_domainset_iter_init() usually do that themselves.
+ *
+ * This function may be called again to reset the iterator to the policy's first
+ * candidate domain.  After each reset, the iterator will visit the same domains
+ * as in the previous iteration minus those on which vm_domainset_iter_ignore()
+ * has been called.  Note that the first candidate domain may change at each
+ * reset (at time of this writing, only on the DOMAINSET_POLICY_ROUNDROBIN
+ * policy).
+ *
+ * Domains which have a number of free pages over 'v_free_min' are always
+ * visited first (this is called the "phase 1" in comments, "phase 2" being the
+ * examination of the remaining domains; no domains are ever visited twice).
+ */
+static bool
+vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
+{
 	/* Initialize the mask of domains to visit. */
+	DOMAINSET_COPY(&di->di_valid_mask, &di->di_remain_mask);
+	/*
+	 * No candidate domains for phase 2 at start.  This will be filled by
+	 * phase 1.
+	 */
+	DOMAINSET_ZERO(&di->di_min_mask);
+	/* Skip domains below 'v_free_min' on phase 1. */
+	di->di_minskip = true;
+
+	return (vm_domainset_iter_phase_first(di, domain));
+}
+
+/*
+ * Advances the iterator to the next candidate domain.
+ *
+ * Returns whether there was another domain to visit.  '*domain' may be modified
+ * even if there is none.
+ *
+ * vm_domainset_iter_first() must have been called at least once before using
+ * this function (see its herald comment for more details on iterators).
+ */
+static bool
+vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
+{
+	/* Loop while there remains domains to visit in the current phase. */
+	while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {
+		/* Grab the next domain in 'ds_order'. */
+		vm_domainset_iter_rr(di, domain);
+		KASSERT(*domain < vm_ndomains,
+		    ("%s: Invalid domain %d", __func__, *domain));
+
+		if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {
+			DOMAINSET_CLR(*domain, &di->di_remain_mask);
+			if (!di->di_minskip || !vm_page_count_min_domain(*domain))
+				return (true);
+			DOMAINSET_SET(*domain, &di->di_min_mask);
+		}
+	}
+
+	/*
+	 * If phase 1 (skip low memory domains) is over, start phase 2 (consider
+	 * low memory domains).
+	 */
 	if (di->di_minskip) {
-		/* Phase 1: Skip domains under 'v_free_min'. */
-		DOMAINSET_COPY(&di->di_valid_mask, &di->di_remain_mask);
-		DOMAINSET_ZERO(&di->di_min_mask);
-	} else
-		/* Phase 2: Browse domains that were under 'v_free_min'. */
+		di->di_minskip = false;
+		/* Browse domains that were under 'v_free_min'. */
 		DOMAINSET_COPY(&di->di_min_mask, &di->di_remain_mask);
+		return (vm_domainset_iter_phase_first(di, domain));
+	}
 
-	/* Mark first domain as seen. */
-	DOMAINSET_CLR(*domain, &di->di_remain_mask);
+	return (false);
 }
 
-void
+int
 vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
-    vm_pindex_t pindex, int *domain, int *req, struct pctrie_iter *pages)
+    vm_pindex_t pindex, int *domain, int *req)
 {
 	struct domainset_ref *dr;
 
+	di->di_flags = *req;
+	*req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
+	    VM_ALLOC_NOWAIT;
+
 	/*
 	 * Object policy takes precedence over thread policy.  The policies
 	 * are immutable and unsynchronized.  Updates can race but pointer
@@ -184,39 +264,21 @@ vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
 		dr = &obj->domain;
 	else
 		dr = &curthread->td_domain;
+
 	vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, obj, pindex);
-	di->di_flags = *req;
-	*req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
-	    VM_ALLOC_NOWAIT;
-	vm_domainset_iter_first(di, domain);
-	if (vm_page_count_min_domain(*domain))
-		vm_domainset_iter_page(di, obj, domain, pages);
+	/*
+	 * XXXOC: Shouldn't we just panic on 'false' if VM_ALLOC_WAITOK was
+	 * passed?
+	 */
+	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
 }
 
 int
 vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
     int *domain, struct pctrie_iter *pages)
 {
-	if (__predict_false(DOMAINSET_EMPTY(&di->di_valid_mask)))
-		return (ENOMEM);
-
-	/* If there are more domains to visit in this phase, run the iterator. */
-	while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {
-		vm_domainset_iter_next(di, domain);
-		if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {
-			DOMAINSET_CLR(*domain, &di->di_remain_mask);
-			if (!di->di_minskip || !vm_page_count_min_domain(*domain))
-				return (0);
-			DOMAINSET_SET(*domain, &di->di_min_mask);
-		}
-	}
-
-	/* If we skipped domains below min restart the search. */
-	if (di->di_minskip) {
-		di->di_minskip = false;
-		vm_domainset_iter_first(di, domain);
+	if (vm_domainset_iter_next(di, domain))
 		return (0);
-	}
 
 	/* If we visited all domains and this was a NOWAIT we return error. */
 	if ((di->di_flags & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) == 0)
@@ -235,64 +297,43 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
 		return (ENOMEM);
 
 	/* Restart the search. */
-	vm_domainset_iter_first(di, domain);
-
-	return (0);
+	/* XXXOC: Shouldn't we just panic on 'false'? */
+	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
 }
 
-static void
+static int
 _vm_domainset_iter_policy_init(struct vm_domainset_iter *di, int *domain,
     int *flags)
{
-
 	di->di_flags = *flags;
 	*flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT;
-	vm_domainset_iter_first(di, domain);
-	if (vm_page_count_min_domain(*domain))
-		vm_domainset_iter_policy(di, domain);
+	/* XXXOC: Shouldn't we just panic on 'false' if M_WAITOK was passed? */
+	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
 }
 
-void
+int
 vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
     struct domainset *ds, int *domain, int *flags)
 {
-
 	vm_domainset_iter_init(di, ds, &curthread->td_domain.dr_iter, NULL, 0);
-	_vm_domainset_iter_policy_init(di, domain, flags);
+	return (_vm_domainset_iter_policy_init(di, domain, flags));
 }
 
-void
+int
 vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
     struct domainset_ref *dr, int *domain, int *flags)
 {
-
 	vm_domainset_iter_init(di, dr->dr_policy, &dr->dr_iter, NULL, 0);
-	_vm_domainset_iter_policy_init(di, domain, flags);
+	return (_vm_domainset_iter_policy_init(di, domain, flags));
 }
 
 int
 vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
 {
-	if (DOMAINSET_EMPTY(&di->di_valid_mask))
-		return (ENOMEM);
-
-	/* If there are more domains to visit in this phase, run the iterator. */
-	while (!DOMAINSET_EMPTY(&di->di_remain_mask)) {
-		vm_domainset_iter_next(di, domain);
-		if (DOMAINSET_ISSET(*domain, &di->di_remain_mask)) {
-			DOMAINSET_CLR(*domain, &di->di_remain_mask);
-			if (!di->di_minskip || !vm_page_count_min_domain(*domain))
-				return (0);
-			DOMAINSET_SET(*domain, &di->di_min_mask);
-		}
-	}
-
-	/* If we skipped domains below min restart the search. */
-	if (di->di_minskip) {
-		di->di_minskip = false;
-		vm_domainset_iter_first(di, domain);
+	if (vm_domainset_iter_next(di, domain))
 		return (0);
-	}
 
 	/* If we visited all domains and this was a NOWAIT we return error. */
 	if ((di->di_flags & M_WAITOK) == 0)
@@ -302,9 +343,8 @@ vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
 	vm_wait_doms(&di->di_valid_mask, 0);
 
 	/* Restart the search. */
-	vm_domainset_iter_first(di, domain);
-
-	return (0);
+	/* XXXOC: Shouldn't we just panic on 'false'? */
+	return (vm_domainset_iter_first(di, domain) ? 0 : ENOMEM);
 }
 
 void
@@ -326,12 +366,12 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, struct vm_object *obj,
 	return (EJUSTRETURN);
 }
 
-void
+int
 vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
-    vm_pindex_t pindex, int *domain, int *flags, struct pctrie_iter *pages)
+    vm_pindex_t pindex, int *domain, int *flags)
 {
-
 	*domain = 0;
+	return (0);
 }
 
 int
@@ -341,20 +381,20 @@ vm_domainset_iter_policy(struct vm_domainset_iter *di, int *domain)
 	return (EJUSTRETURN);
 }
 
-void
+int
 vm_domainset_iter_policy_init(struct vm_domainset_iter *di,
     struct domainset *ds, int *domain, int *flags)
 {
-
 	*domain = 0;
+	return (0);
 }
 
-void
+int
 vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *di,
    struct domainset_ref *dr, int *domain, int *flags)
 {
-
 	*domain = 0;
+	return (0);
 }
 
 void
diff --git a/sys/vm/vm_domainset.h b/sys/vm/vm_domainset.h
index b223a4d03df9..ef86c8ccb5e4 100644
--- a/sys/vm/vm_domainset.h
+++ b/sys/vm/vm_domainset.h
@@ -47,13 +47,12 @@ struct vm_domainset_iter {
 
 int vm_domainset_iter_page(struct vm_domainset_iter *, struct vm_object *,
     int *, struct pctrie_iter *);
-void vm_domainset_iter_page_init(struct vm_domainset_iter *,
-    struct vm_object *, vm_pindex_t, int *, int *,
-    struct pctrie_iter *);
+int vm_domainset_iter_page_init(struct vm_domainset_iter *,
+    struct vm_object *, vm_pindex_t, int *, int *);
 int vm_domainset_iter_policy(struct vm_domainset_iter *, int *);
-void vm_domainset_iter_policy_init(struct vm_domainset_iter *,
+int vm_domainset_iter_policy_init(struct vm_domainset_iter *,
     struct domainset *, int *, int *);
-void vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *,
+int vm_domainset_iter_policy_ref_init(struct vm_domainset_iter *,
     struct domainset_ref *, int *, int *);
 void vm_domainset_iter_ignore(struct vm_domainset_iter *, int);
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index 94df2c2f9a9e..e0f1807a1b32 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -453,7 +453,7 @@ vm_thread_stack_create(struct domainset *ds, int pages)
 	obj = vm_thread_kstack_size_to_obj(pages);
 	if (vm_ndomains > 1)
 		obj->domain.dr_policy = ds;
-	vm_domainset_iter_page_init(&di, obj, 0, &domain, &req, NULL);
+	vm_domainset_iter_page_init(&di, obj, 0, &domain, &req);
 	do {
 		/*
 		 * Get a kernel virtual address for this thread's kstack.
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index e7d7b6726d2c..ac327aa37b72 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -323,7 +323,9 @@ kmem_alloc_attr_domainset(struct domainset *ds, vm_size_t size, int flags,
 
 	start_segind = -1;
 
-	vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
+	if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0)
+		return (NULL);
+
 	do {
 		addr = kmem_alloc_attr_domain(domain, size, flags, low, high,
 		    memattr);
@@ -417,7 +419,9 @@ kmem_alloc_contig_domainset(struct domainset *ds, vm_size_t size, int flags,
 
 	start_segind = -1;
 
-	vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
+	if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags))
+		return (NULL);
+
 	do {
 		addr = kmem_alloc_contig_domain(domain, size, flags, low, high,
 		    alignment, boundary, memattr);
@@ -517,7 +521,9 @@ kmem_malloc_domainset(struct domainset *ds, vm_size_t size, int flags)
 	void *addr;
 	int domain;
 
-	vm_domainset_iter_policy_init(&di, ds, &domain, &flags);
+	if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0)
+		return (NULL);
+
 	do {
 		addr = kmem_malloc_domain(domain, size, flags);
 		if (addr != NULL)
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index abad5efb8a79..16878604fa11 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -2015,8 +2015,9 @@ vm_page_alloc_iter(vm_object_t object, vm_pindex_t pindex, int req,
 	vm_page_t m;
 	int domain;
 
-	vm_domainset_iter_page_init(&di, object, pindex, &domain, &req,
-	    pages);
+	if (vm_domainset_iter_page_init(&di, object, pindex, &domain, &req) != 0)
+		return (NULL);
+
 	do {
 		m = vm_page_alloc_domain_iter(object, pindex, domain, req,
 		    pages);
@@ -2268,7 +2269,9 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
 
 	start_segind = -1;
 
-	vm_domainset_iter_page_init(&di, object, pindex, &domain, &req, NULL);
+	if (vm_domainset_iter_page_init(&di, object, pindex, &domain, &req) != 0)
+		return (NULL);
+
 	do {
 		m = vm_page_alloc_contig_domain(object, pindex, domain, req,
 		    npages, low, high, alignment, boundary, memattr);
@@ -2596,7 +2599,9 @@ vm_page_alloc_noobj(int req)
 	vm_page_t m;
 	int domain;
 
-	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL);
+	if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
+		return (NULL);
+
 	do {
 		m = vm_page_alloc_noobj_domain(domain, req);
 		if (m != NULL)
@@ -2615,7 +2620,9 @@ vm_page_alloc_noobj_contig(int req, u_long npages, vm_paddr_t low,
 	vm_page_t m;
 	int domain;
 
-	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL);
+	if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
+		return (NULL);
+
 	do {
 		m = vm_page_alloc_noobj_contig_domain(domain, req, npages, low,
 		    high, alignment, boundary, memattr);
@@ -3334,7 +3341,9 @@ vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
 
 	ret = ERANGE;
 
-	vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req, NULL);
+	if (vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req) != 0)
+		return (ret);
+
 	do {
 		status = vm_page_reclaim_contig_domain(domain, req, npages,
 		    low, high, alignment, boundary);
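To make the two-phase scheme described in the commit message easier to follow outside of the kernel sources, here is a minimal, self-contained userland sketch of the idea: offer domains holding more than a free-page floor first, then fall back to the ones that were skipped. It is not the kernel code; NDOMAINS, FREE_MIN and free_pages[] are stand-ins for the real per-domain state, and only a round-robin policy is modelled.

/*
 * Simplified illustration of the two-phase domain iteration: phase 1 only
 * offers domains with at least FREE_MIN free pages; domains below the
 * floor are remembered and offered in phase 2.  No domain is visited twice.
 */
#include <stdbool.h>
#include <stdio.h>

#define	NDOMAINS	4
#define	FREE_MIN	128

static unsigned free_pages[NDOMAINS] = { 512, 64, 0, 300 };

struct iter {
	unsigned cursor;		/* round-robin position */
	bool	remain[NDOMAINS];	/* not yet visited in this phase */
	bool	min[NDOMAINS];		/* deferred to phase 2 */
	bool	minskip;		/* phase 1: skip domains below FREE_MIN */
};

/* Returns false once every eligible domain has been offered. */
static bool
iter_next(struct iter *it, int *domain)
{
	for (;;) {
		bool any = false;

		for (int i = 0; i < NDOMAINS; i++)
			any |= it->remain[i];
		if (!any) {
			/* Phase 1 exhausted: fall back to skipped domains. */
			if (!it->minskip)
				return (false);
			it->minskip = false;
			for (int i = 0; i < NDOMAINS; i++)
				it->remain[i] = it->min[i];
			continue;
		}

		int d = it->cursor++ % NDOMAINS;
		if (!it->remain[d])
			continue;
		it->remain[d] = false;
		if (it->minskip && free_pages[d] < FREE_MIN) {
			it->min[d] = true;	/* revisit in phase 2 */
			continue;
		}
		*domain = d;
		return (true);
	}
}

static bool
iter_first(struct iter *it, int *domain)
{
	for (int i = 0; i < NDOMAINS; i++) {
		it->remain[i] = true;	/* every allowed domain */
		it->min[i] = false;
	}
	it->minskip = true;
	return (iter_next(it, domain));
}

int
main(void)
{
	struct iter it = { .cursor = 0 };
	int d;

	for (bool ok = iter_first(&it, &d); ok; ok = iter_next(&it, &d))
		printf("try domain %d (%u free pages)\n", d, free_pages[d]);
	return (0);
}

With the sample numbers above, the domains are offered in the order 0, 3, 1, 2: the two domains below FREE_MIN are only tried once every other domain has been.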
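Since the iterator init functions now return an error instead of void, every call site gains the same check-then-loop shape that the diff shows for malloc_domainset() and kmem_malloc_domainset(). A condensed sketch of that pattern follows; alloc_one_domain() stands in for whatever per-domain attempt the caller performs and is a placeholder, not an existing kernel function.

/*
 * Sketch of an updated caller: iterator initialization can now fail when
 * the domainset leaves nothing to iterate on, so the allocation loop is
 * only entered on success.  alloc_one_domain() is hypothetical.
 */
static void *
alloc_from_domainset(struct domainset *ds, size_t size, int flags)
{
	struct vm_domainset_iter di;
	void *p = NULL;
	int domain;

	if (vm_domainset_iter_policy_init(&di, ds, &domain, &flags) != 0)
		return (NULL);		/* no eligible domain at all */
	do {
		p = alloc_one_domain(domain, size, flags);
	} while (p == NULL && vm_domainset_iter_policy(&di, &domain) == 0);
	return (p);
}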