Re: git: d3f96f661050 - main - Fix O(n^2) behavior in sysctl

From: Alan Somers <asomers_at_freebsd.org>
Date: Wed, 28 Sep 2022 16:33:38 UTC
I don't think a different interface would necessarily fix all
consistency problems, because there are probably additional
consistency problems within ZFS itself.  The ZFS stats just weren't
designed to provide a consistent view, like "zpool status".  But a
different interface certainly could be much faster; sysctl is slow.
Do you have any good ideas for how to easily create such an alternate
interface?

On Wed, Sep 28, 2022 at 10:19 AM Maxim Sobolev <sobomax@freebsd.org> wrote:
>
> This also brings a question as to whether sysctl is the right interface to pull this data from the kernel in the first place? From my somewhat ignorant look this approach is likely to be poised with all sorts of race conditions, such so if configuration changes while you are pulling it out you'd get some inconsistent view that is not here not there. Wouldn't it be easier to use some other mechanism to pull configuration of all 1,000 datasets as one blob in one or few system calls? Like read(2) from /dev/zfsstats or something like that? Then you can iterate over it as much as you need in userland.
>
> -Max
>
> On Tue, Sep 27, 2022, 3:04 AM Alan Somers <asomers@freebsd.org> wrote:
>>
>> The branch main has been updated by asomers:
>>
>> URL: https://cgit.FreeBSD.org/src/commit/?id=d3f96f661050e9bd21fe29931992a8b9e67ff189
>>
>> commit d3f96f661050e9bd21fe29931992a8b9e67ff189
>> Author:     Alan Somers <asomers@FreeBSD.org>
>> AuthorDate: 2022-09-07 14:12:49 +0000
>> Commit:     Alan Somers <asomers@FreeBSD.org>
>> CommitDate: 2022-09-27 00:03:34 +0000
>>
>>     Fix O(n^2) behavior in sysctl
>>
>>     Sysctl OIDs were internally stored in linked lists, triggering O(n^2)
>>     behavior when userland iterates over many of them.  The slowdown is
>>     noticeable for MIBs that have > 100 children (for example, vm.uma).  But
>>     it's unignorable for kstat.zfs when a pool has > 1000 datasets.
>>
>>     Convert the linked lists into RB trees.  This produces a ~25x speedup
>>     for listing kstat.zfs with 4100 datasets, and no measurable penalty for
>>     small dataset counts.
>>
>>     Bump __FreeBSD_version for the KPI change.
>>
>>     Sponsored by:   Axcient
>>     Reviewed by:    mjg
>>     Differential Revision: https://reviews.freebsd.org/D36500
>> ---
>>  sys/compat/linuxkpi/common/include/linux/sysfs.h |   2 +-
>>  sys/kern/kern_sysctl.c                           | 149 +++++++++++------------
>>  sys/kern/vfs_init.c                              |   2 +-
>>  sys/sys/param.h                                  |   2 +-
>>  sys/sys/sysctl.h                                 |  31 ++++-
>>  5 files changed, 99 insertions(+), 87 deletions(-)
>>
>> diff --git a/sys/compat/linuxkpi/common/include/linux/sysfs.h b/sys/compat/linuxkpi/common/include/linux/sysfs.h
>> index 0b6b479d9362..881a72e62ed9 100644
>> --- a/sys/compat/linuxkpi/common/include/linux/sysfs.h
>> +++ b/sys/compat/linuxkpi/common/include/linux/sysfs.h
>> @@ -246,7 +246,7 @@ sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp)
>>         struct attribute **attr;
>>         struct sysctl_oid *oidp;
>>
>> -       SLIST_FOREACH(oidp, SYSCTL_CHILDREN(kobj->oidp), oid_link) {
>> +       RB_FOREACH(oidp, sysctl_oid_list, SYSCTL_CHILDREN(kobj->oidp)) {
>>                 if (strcmp(oidp->oid_name, grp->name) != 0)
>>                         continue;
>>                 for (attr = grp->attrs; *attr != NULL; attr++) {
>> diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
>> index 9bc595f111cc..e1cd6ea4bd61 100644
>> --- a/sys/kern/kern_sysctl.c
>> +++ b/sys/kern/kern_sysctl.c
>> @@ -84,6 +84,8 @@ static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic");
>>  static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids");
>>  static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer");
>>
>> +RB_GENERATE(sysctl_oid_list, sysctl_oid, oid_link, cmp_sysctl_oid);
>> +
>>  /*
>>   * The sysctllock protects the MIB tree.  It also protects sysctl
>>   * contexts used with dynamic sysctls.  The sysctl_register_oid() and
>> @@ -120,7 +122,7 @@ static struct sx sysctlstringlock;
>>  static int sysctl_root(SYSCTL_HANDLER_ARGS);
>>
>>  /* Root list */
>> -struct sysctl_oid_list sysctl__children = SLIST_HEAD_INITIALIZER(&sysctl__children);
>> +struct sysctl_oid_list sysctl__children = RB_INITIALIZER(&sysctl__children);
>>
>>  static char*   sysctl_escape_name(const char*);
>>  static int     sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del,
>> @@ -134,7 +136,7 @@ sysctl_find_oidname(const char *name, struct sysctl_oid_list *list)
>>         struct sysctl_oid *oidp;
>>
>>         SYSCTL_ASSERT_LOCKED();
>> -       SLIST_FOREACH(oidp, list, oid_link) {
>> +       RB_FOREACH(oidp, sysctl_oid_list, list) {
>>                 if (strcmp(oidp->oid_name, name) == 0) {
>>                         return (oidp);
>>                 }
>> @@ -356,11 +358,14 @@ sysctl_search_oid(struct sysctl_oid **nodes, struct sysctl_oid *needle)
>>         indx = 0;
>>         while (indx < CTL_MAXNAME && indx >= 0) {
>>                 if (nodes[indx] == NULL && indx == 0)
>> -                       nodes[indx] = SLIST_FIRST(&sysctl__children);
>> +                       nodes[indx] = RB_MIN(sysctl_oid_list,
>> +                           &sysctl__children);
>>                 else if (nodes[indx] == NULL)
>> -                       nodes[indx] = SLIST_FIRST(&nodes[indx - 1]->oid_children);
>> +                       nodes[indx] = RB_MIN(sysctl_oid_list,
>> +                           &nodes[indx - 1]->oid_children);
>>                 else
>> -                       nodes[indx] = SLIST_NEXT(nodes[indx], oid_link);
>> +                       nodes[indx] = RB_NEXT(sysctl_oid_list,
>> +                           &nodes[indx - 1]->oid_children, nodes[indx]);
>>
>>                 if (nodes[indx] == needle)
>>                         return (indx + 1);
>> @@ -425,8 +430,7 @@ void
>>  sysctl_register_oid(struct sysctl_oid *oidp)
>>  {
>>         struct sysctl_oid_list *parent = oidp->oid_parent;
>> -       struct sysctl_oid *p;
>> -       struct sysctl_oid *q;
>> +       struct sysctl_oid *p, key;
>>         int oid_number;
>>         int timeout = 2;
>>
>> @@ -476,25 +480,21 @@ sysctl_register_oid(struct sysctl_oid *oidp)
>>          * Insert the OID into the parent's list sorted by OID number.
>>          */
>>  retry:
>> -       q = NULL;
>> -       SLIST_FOREACH(p, parent, oid_link) {
>> -               /* check if the current OID number is in use */
>> -               if (oid_number == p->oid_number) {
>> -                       /* get the next valid OID number */
>> -                       if (oid_number < CTL_AUTO_START ||
>> -                           oid_number == 0x7fffffff) {
>> -                               /* wraparound - restart */
>> -                               oid_number = CTL_AUTO_START;
>> -                               /* don't loop forever */
>> -                               if (!timeout--)
>> -                                       panic("sysctl: Out of OID numbers\n");
>> -                               goto retry;
>> -                       } else {
>> -                               oid_number++;
>> -                       }
>> -               } else if (oid_number < p->oid_number)
>> -                       break;
>> -               q = p;
>> +       key.oid_number = oid_number;
>> +       p = RB_FIND(sysctl_oid_list, parent, &key);
>> +       if (p) {
>> +               /* get the next valid OID number */
>> +               if (oid_number < CTL_AUTO_START ||
>> +                   oid_number == 0x7fffffff) {
>> +                       /* wraparound - restart */
>> +                       oid_number = CTL_AUTO_START;
>> +                       /* don't loop forever */
>> +                       if (!timeout--)
>> +                               panic("sysctl: Out of OID numbers\n");
>> +                       goto retry;
>> +               } else {
>> +                       oid_number++;
>> +               }
>>         }
>>         /* check for non-auto OID number collision */
>>         if (oidp->oid_number >= 0 && oidp->oid_number < CTL_AUTO_START &&
>> @@ -504,10 +504,7 @@ retry:
>>         }
>>         /* update the OID number, if any */
>>         oidp->oid_number = oid_number;
>> -       if (q != NULL)
>> -               SLIST_INSERT_AFTER(q, oidp, oid_link);
>> -       else
>> -               SLIST_INSERT_HEAD(parent, oidp, oid_link);
>> +       RB_INSERT(sysctl_oid_list, parent, oidp);
>>
>>         if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE &&
>>  #ifdef VIMAGE
>> @@ -556,7 +553,6 @@ sysctl_enable_oid(struct sysctl_oid *oidp)
>>  void
>>  sysctl_unregister_oid(struct sysctl_oid *oidp)
>>  {
>> -       struct sysctl_oid *p;
>>         int error;
>>
>>         SYSCTL_ASSERT_WLOCKED();
>> @@ -564,14 +560,8 @@ sysctl_unregister_oid(struct sysctl_oid *oidp)
>>                 error = EINVAL;
>>         } else {
>>                 error = ENOENT;
>> -               SLIST_FOREACH(p, oidp->oid_parent, oid_link) {
>> -                       if (p == oidp) {
>> -                               SLIST_REMOVE(oidp->oid_parent, oidp,
>> -                                   sysctl_oid, oid_link);
>> -                               error = 0;
>> -                               break;
>> -                       }
>> -               }
>> +               if (RB_REMOVE(sysctl_oid_list, oidp->oid_parent, oidp))
>> +                       error = 0;
>>         }
>>
>>         /*
>> @@ -732,17 +722,14 @@ int
>>  sysctl_remove_name(struct sysctl_oid *parent, const char *name,
>>      int del, int recurse)
>>  {
>> -       struct sysctl_oid *p, *tmp;
>> +       struct sysctl_oid *p;
>>         int error;
>>
>>         error = ENOENT;
>>         SYSCTL_WLOCK();
>> -       SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(parent), oid_link, tmp) {
>> -               if (strcmp(p->oid_name, name) == 0) {
>> -                       error = sysctl_remove_oid_locked(p, del, recurse);
>> -                       break;
>> -               }
>> -       }
>> +       p = sysctl_find_oidname(name, &parent->oid_children);
>> +       if (p)
>> +               error = sysctl_remove_oid_locked(p, del, recurse);
>>         SYSCTL_WUNLOCK();
>>
>>         return (error);
>> @@ -811,14 +798,16 @@ sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse)
>>          */
>>         if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
>>                 if (oidp->oid_refcnt == 1) {
>> -                       SLIST_FOREACH_SAFE(p,
>> -                           SYSCTL_CHILDREN(oidp), oid_link, tmp) {
>> +                       for(p = RB_MIN(sysctl_oid_list, &oidp->oid_children);
>> +                           p != NULL; p = tmp) {
>>                                 if (!recurse) {
>>                                         printf("Warning: failed attempt to "
>>                                             "remove oid %s with child %s\n",
>>                                             oidp->oid_name, p->oid_name);
>>                                         return (ENOTEMPTY);
>>                                 }
>> +                               tmp = RB_NEXT(sysctl_oid_list,
>> +                                   &oidp->oid_children, p);
>>                                 error = sysctl_remove_oid_locked(p, del,
>>                                     recurse);
>>                                 if (error)
>> @@ -895,7 +884,7 @@ sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent,
>>         }
>>         oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO);
>>         oidp->oid_parent = parent;
>> -       SLIST_INIT(&oidp->oid_children);
>> +       RB_INIT(&oidp->oid_children);
>>         oidp->oid_number = number;
>>         oidp->oid_refcnt = 1;
>>         oidp->oid_name = escaped;
>> @@ -1016,7 +1005,7 @@ sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
>>         struct sysctl_oid *oidp;
>>
>>         SYSCTL_ASSERT_LOCKED();
>> -       SLIST_FOREACH(oidp, l, oid_link) {
>> +       RB_FOREACH(oidp, sysctl_oid_list, l) {
>>                 for (k=0; k<i; k++)
>>                         printf(" ");
>>
>> @@ -1081,7 +1070,7 @@ sysctl_sysctl_name(SYSCTL_HANDLER_ARGS)
>>         int *name = (int *) arg1;
>>         u_int namelen = arg2;
>>         int error;
>> -       struct sysctl_oid *oid;
>> +       struct sysctl_oid *oid, key;
>>         struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
>>         struct rm_priotracker tracker;
>>         char buf[10];
>> @@ -1105,10 +1094,9 @@ sysctl_sysctl_name(SYSCTL_HANDLER_ARGS)
>>                         continue;
>>                 }
>>                 lsp2 = NULL;
>> -               SLIST_FOREACH(oid, lsp, oid_link) {
>> -                       if (oid->oid_number != *name)
>> -                               continue;
>> -
>> +               key.oid_number = *name;
>> +               oid = RB_FIND(sysctl_oid_list, lsp, &key);
>> +               if (oid) {
>>                         if (req->oldidx)
>>                                 error = SYSCTL_OUT(req, ".", 1);
>>                         if (!error)
>> @@ -1120,14 +1108,9 @@ sysctl_sysctl_name(SYSCTL_HANDLER_ARGS)
>>                         namelen--;
>>                         name++;
>>
>> -                       if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE)
>> -                               break;
>> -
>> -                       if (oid->oid_handler)
>> -                               break;
>> -
>> -                       lsp2 = SYSCTL_CHILDREN(oid);
>> -                       break;
>> +                       if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE &&
>> +                               !oid->oid_handler)
>> +                               lsp2 = SYSCTL_CHILDREN(oid);
>>                 }
>>                 lsp = lsp2;
>>         }
>> @@ -1239,13 +1222,25 @@ static bool
>>  sysctl_sysctl_next_action(struct sysctl_oid_list *lsp, int *name, u_int namelen,
>>      int *next, int *len, int level, bool honor_skip)
>>  {
>> -       struct sysctl_oid *oidp;
>> +       struct sysctl_oid_list *next_lsp;
>> +       struct sysctl_oid *oidp = NULL, key;
>>         bool success = false;
>>         enum sysctl_iter_action action;
>>
>>         SYSCTL_ASSERT_LOCKED();
>> -       SLIST_FOREACH(oidp, lsp, oid_link) {
>> -               action = sysctl_sysctl_next_node(oidp, name, namelen, honor_skip);
>> +       /*
>> +        * Start the search at the requested oid.  But if not found, then scan
>> +        * through all children.
>> +        */
>> +       if (namelen > 0) {
>> +               key.oid_number = *name;
>> +               oidp = RB_FIND(sysctl_oid_list, lsp, &key);
>> +       }
>> +       if (!oidp)
>> +               oidp = RB_MIN(sysctl_oid_list, lsp);
>> +       for(; oidp != NULL; oidp = RB_NEXT(sysctl_oid_list, lsp, oidp)) {
>> +               action = sysctl_sysctl_next_node(oidp, name, namelen,
>> +                   honor_skip);
>>                 if (action == ITER_SIBLINGS)
>>                         continue;
>>                 if (action == ITER_FOUND) {
>> @@ -1254,13 +1249,13 @@ sysctl_sysctl_next_action(struct sysctl_oid_list *lsp, int *name, u_int namelen,
>>                 }
>>                 KASSERT((action== ITER_CHILDREN), ("ret(%d)!=ITER_CHILDREN", action));
>>
>> -               lsp = SYSCTL_CHILDREN(oidp);
>> +               next_lsp = SYSCTL_CHILDREN(oidp);
>>                 if (namelen == 0) {
>> -                       success = sysctl_sysctl_next_action(lsp, NULL, 0,
>> +                       success = sysctl_sysctl_next_action(next_lsp, NULL, 0,
>>                             next + 1, len, level + 1, honor_skip);
>>                 } else {
>> -                       success = sysctl_sysctl_next_action(lsp, name + 1, namelen - 1,
>> -                           next + 1, len, level + 1, honor_skip);
>> +                       success = sysctl_sysctl_next_action(next_lsp, name + 1,
>> +                           namelen - 1, next + 1, len, level + 1, honor_skip);
>>                         if (!success) {
>>
>>                                 /*
>> @@ -1332,13 +1327,12 @@ name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp)
>>         for (*len = 0; *len < CTL_MAXNAME;) {
>>                 p = strsep(&name, ".");
>>
>> -               oidp = SLIST_FIRST(lsp);
>> -               for (;; oidp = SLIST_NEXT(oidp, oid_link)) {
>> -                       if (oidp == NULL)
>> -                               return (ENOENT);
>> +               RB_FOREACH(oidp, sysctl_oid_list, lsp) {
>>                         if (strcmp(p, oidp->oid_name) == 0)
>>                                 break;
>>                 }
>> +               if (oidp == NULL)
>> +                       return (ENOENT);
>>                 *oid++ = oidp->oid_number;
>>                 (*len)++;
>>
>> @@ -2162,16 +2156,15 @@ sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
>>  {
>>         struct sysctl_oid_list *lsp;
>>         struct sysctl_oid *oid;
>> +       struct sysctl_oid key;
>>         int indx;
>>
>>         SYSCTL_ASSERT_LOCKED();
>>         lsp = &sysctl__children;
>>         indx = 0;
>>         while (indx < CTL_MAXNAME) {
>> -               SLIST_FOREACH(oid, lsp, oid_link) {
>> -                       if (oid->oid_number == name[indx])
>> -                               break;
>> -               }
>> +               key.oid_number = name[indx];
>> +               oid = RB_FIND(sysctl_oid_list, lsp, &key);
>>                 if (oid == NULL)
>>                         return (ENOENT);
>>
>> diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c
>> index 6572a8e362c2..6e2e78aaf597 100644
>> --- a/sys/kern/vfs_init.c
>> +++ b/sys/kern/vfs_init.c
>> @@ -525,7 +525,7 @@ vfs_register(struct vfsconf *vfc)
>>          * number.
>>          */
>>         sysctl_wlock();
>> -       SLIST_FOREACH(oidp, SYSCTL_CHILDREN(&sysctl___vfs), oid_link) {
>> +       RB_FOREACH(oidp, sysctl_oid_list, SYSCTL_CHILDREN(&sysctl___vfs)) {
>>                 if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) {
>>                         sysctl_unregister_oid(oidp);
>>                         oidp->oid_number = vfc->vfc_typenum;
>> diff --git a/sys/sys/param.h b/sys/sys/param.h
>> index f875d839d41f..3f5da06ef951 100644
>> --- a/sys/sys/param.h
>> +++ b/sys/sys/param.h
>> @@ -76,7 +76,7 @@
>>   * cannot include sys/param.h and should only be updated here.
>>   */
>>  #undef __FreeBSD_version
>> -#define __FreeBSD_version 1400070
>> +#define __FreeBSD_version 1400071
>>
>>  /*
>>   * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
>> diff --git a/sys/sys/sysctl.h b/sys/sys/sysctl.h
>> index 451d83bbe125..3bd77cf87243 100644
>> --- a/sys/sys/sysctl.h
>> +++ b/sys/sys/sysctl.h
>> @@ -39,7 +39,8 @@
>>  #define        _SYS_SYSCTL_H_
>>
>>  #ifdef _KERNEL
>> -#include <sys/queue.h>
>> +#include <sys/tree.h>
>> +#include <sys/systm.h>
>>  #endif
>>
>>  /*
>> @@ -173,20 +174,25 @@ struct sysctl_req {
>>         int              flags;
>>  };
>>
>> -SLIST_HEAD(sysctl_oid_list, sysctl_oid);
>> +struct sysctl_oid;
>> +
>> +/* RB Tree handling */
>> +RB_HEAD(sysctl_oid_list, sysctl_oid);
>>
>>  /*
>>   * This describes one "oid" in the MIB tree.  Potentially more nodes can
>>   * be hidden behind it, expanded by the handler.
>>   */
>>  struct sysctl_oid {
>> -       struct sysctl_oid_list oid_children;
>> -       struct sysctl_oid_list *oid_parent;
>> -       SLIST_ENTRY(sysctl_oid) oid_link;
>> +       struct sysctl_oid_list  oid_children;
>> +       struct sysctl_oid_list* oid_parent;
>> +       RB_ENTRY(sysctl_oid) oid_link;
>> +       /* Sort key for all siblings, and lookup key for userland */
>>         int              oid_number;
>>         u_int            oid_kind;
>>         void            *oid_arg1;
>>         intmax_t         oid_arg2;
>> +       /* Must be unique amongst all siblings. */
>>         const char      *oid_name;
>>         int             (*oid_handler)(SYSCTL_HANDLER_ARGS);
>>         const char      *oid_fmt;
>> @@ -196,6 +202,19 @@ struct sysctl_oid {
>>         const char      *oid_label;
>>  };
>>
>> +static inline int
>> +cmp_sysctl_oid(struct sysctl_oid *a, struct sysctl_oid *b)
>> +{
>> +       if (a->oid_number > b->oid_number)
>> +               return (1);
>> +       else if (a->oid_number < b->oid_number)
>> +               return (-1);
>> +       else
>> +               return (0);
>> +}
>> +
>> +RB_PROTOTYPE(sysctl_oid_list, sysctl_oid, oid_link, cmp_sysctl_oid);
>> +
>>  #define        SYSCTL_IN(r, p, l)      (r->newfunc)(r, p, l)
>>  #define        SYSCTL_OUT(r, p, l)     (r->oldfunc)(r, p, l)
>>  #define        SYSCTL_OUT_STR(r, p)    (r->oldfunc)(r, p, strlen(p) + 1)
>> @@ -275,7 +294,7 @@ TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry);
>>  #define        SYSCTL_OID_RAW(id, parent_child_head, nbr, name, kind, a1, a2, handler, fmt, descr, label) \
>>         struct sysctl_oid id = {                                        \
>>                 .oid_parent = (parent_child_head),                      \
>> -               .oid_children = SLIST_HEAD_INITIALIZER(&id.oid_children), \
>> +               .oid_children = RB_INITIALIZER(&id.oid_children), \
>>                 .oid_number = (nbr),                                    \
>>                 .oid_kind = (kind),                                     \
>>                 .oid_arg1 = (a1),                                       \
>>