Re: git: 6468cd8e0ef9 - main - mount: add vnode usage per file system with mount -v

From: Mateusz Guzik <mjguzik_at_gmail.com>
Date: Tue, 14 Jun 2022 20:24:09 UTC
On 6/14/22, Doug Ambrisko <ambrisko@ambrisko.com> wrote:
> On Mon, Jun 13, 2022 at 09:56:38PM +0200, Mateusz Guzik wrote:
> | On 6/13/22, Doug Ambrisko <ambrisko@ambrisko.com> wrote:
> | > On Mon, Jun 13, 2022 at 06:43:31PM +0200, Mateusz Guzik wrote:
> | > | On 6/13/22, Doug Ambrisko <ambrisko@freebsd.org> wrote:
> | > | > The branch main has been updated by ambrisko:
> | > | >
> | > | > URL:
> | > | >
> | >
> https://cgit.FreeBSD.org/src/commit/?id=6468cd8e0ef9d1d3331e9de26cd2be59bc778494
> | > | >
> | > | > commit 6468cd8e0ef9d1d3331e9de26cd2be59bc778494
> | > | > Author:     Doug Ambrisko <ambrisko@FreeBSD.org>
> | > | > AuthorDate: 2022-06-13 14:56:38 +0000
> | > | > Commit:     Doug Ambrisko <ambrisko@FreeBSD.org>
> | > | > CommitDate: 2022-06-13 14:56:38 +0000
> | > | >
> | > | >     mount: add vnode usage per file system with mount -v
> | > | >
> | > | >     This avoids the need to drop into the ddb to figure out vnode
> | > | >     usage per file system.  It helps to see if they are or are not
> | > | >     being freed.  Suggestion to report active vnode count was from
> | > | >     kib@
> | > | >
> | > | >     Reviewed by:    kib
> | > | >     Differential Revision: https://reviews.freebsd.org/D35436
> | > | > ---
> | > | >  sbin/mount/mount.c   |  7 +++++++
> | > | >  sys/kern/vfs_mount.c | 12 ++++++++++++
> | > | >  sys/sys/mount.h      |  4 +++-
> | > | >  3 files changed, 22 insertions(+), 1 deletion(-)
> | > | >
> | > | > diff --git a/sbin/mount/mount.c b/sbin/mount/mount.c
> | > | > index 79d9d6cb0caf..bd3d0073c474 100644
> | > | > --- a/sbin/mount/mount.c
> | > | > +++ b/sbin/mount/mount.c
> | > | > @@ -692,6 +692,13 @@ prmount(struct statfs *sfp)
> | > | >  			xo_emit("{D:, }{Lw:fsid}{:fsid}", fsidbuf);
> | > | >  			free(fsidbuf);
> | > | >  		}
> | > | > +		if (sfp->f_nvnodelistsize != 0 || sfp->f_avnodecount != 0) {
> | > | > +			xo_open_container("vnodes");
> | > | > +			xo_emit("{D:,
> | > | > }{Lwc:vnodes}{Lw:count}{w:count/%ju}{Lw:active}{:active/%ju}",
> | > | > +			    (uintmax_t)sfp->f_nvnodelistsize,
> | > | > +			    (uintmax_t)sfp->f_avnodecount);
> | > | > +			xo_close_container("vnodes");
> | > | > +		}
> | > | >  	}
> | > | >  	xo_emit("{D:)}\n");
> | > | >  }
> | > | > diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
> | > | > index 71a40fd97a9c..e3818b67e841 100644
> | > | > --- a/sys/kern/vfs_mount.c
> | > | > +++ b/sys/kern/vfs_mount.c
> | > | > @@ -2610,6 +2610,8 @@ vfs_copyopt(struct vfsoptlist *opts, const
> char
> | > *name,
> | > | > void *dest, int len)
> | > | >  int
> | > | >  __vfs_statfs(struct mount *mp, struct statfs *sbp)
> | > | >  {
> | > | > +	struct vnode *vp;
> | > | > +	uint32_t count;
> | > | >
> | > | >  	/*
> | > | >  	 * Filesystems only fill in part of the structure for updates, we
> | > | > @@ -2624,6 +2626,16 @@ __vfs_statfs(struct mount *mp, struct statfs
> | > *sbp)
> | > | >  	sbp->f_version = STATFS_VERSION;
> | > | >  	sbp->f_namemax = NAME_MAX;
> | > | >  	sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
> | > | > +	sbp->f_nvnodelistsize = mp->mnt_nvnodelistsize;
> | > | > +
> | > | > +	count = 0;
> | > | > +	MNT_ILOCK(mp);
> | > | > +	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
> | > | > +		if (vrefcnt(vp) > 0) /* racy but does not matter */
> | > | > +			count++;
> | > | > +	}
> | > | > +	MNT_IUNLOCK(mp);
> | > | > +	sbp->f_avnodecount = count;
> | > | >
> | > |
> | > | libc uses statfs for dir walk (see gen/fts.c), most notably find
> | > | immediately runs into it. As such the linear scan by default is a
> | > | non-starter.
> | > |
> | > | I don't know if mount is the right place to dump this kind of info to
> | > | begin with, but even so, it should only happen with a dedicated flag.
> | > |
> | > | As statfs does not take any flags on its own, there is no way to
> | > | prevent it from doing the above walk. Perhaps a dedicated sysctl which
> | > | takes a mount point id could do the walk instead, when asked.
> | > |
> | > | Short of making the walk optional, I'm afraid this will have to be
> | > | reverted.
> | >
> | > Just to be clear, this isn't breaking things but is not optimal for
> | > things that don't need this extra info.
> | >
> |
> | It's not "not optimal", it's a significant overhead which taxes
> | frequent users which don't benefit from it.
> |
> | For more data I plugged dtrace -n 'fbt::__vfs_statfs:entry {
> | @[execname] = count(); }' while package building, then I got tons of
> | hits:
> | [snip]
> |   expr                                                          13992
> |   install                                                       14090
> |   dirname                                                       14921
> |   mv                                                            17404
> |   ghc-stage1                                                    17577
> |   grep                                                          18998
> |   xgcc                                                          23832
> |   cpp                                                           29282
> |   cc1                                                           36961
> |   sh                                                            70575
> |   rm                                                            73904
> |   ld.lld                                                        87784
> |   sed                                                           88803
> |   c++                                                           98175
> |   cat                                                          115811
> |   cc                                                           449725
> |
>
> I'd rather not revert it all, but revert only the active part that kib@
> thought would be useful versus lazy vnodes.  This is what I propose to commit:
>

I don't have a strong opinion about this one. I would argue that a separate,
more comprehensive stat reporting mechanism should be implemented, but for
the time being I'm fine with whatever sorts out the immediate
problem.

> diff --git a/sbin/mount/mount.c b/sbin/mount/mount.c
> index bd3d0073c47..6c986907bcd 100644
> --- a/sbin/mount/mount.c
> +++ b/sbin/mount/mount.c
> @@ -692,11 +692,10 @@ prmount(struct statfs *sfp)
>  			xo_emit("{D:, }{Lw:fsid}{:fsid}", fsidbuf);
>  			free(fsidbuf);
>  		}
> -		if (sfp->f_nvnodelistsize != 0 || sfp->f_avnodecount != 0) {
> +		if (sfp->f_nvnodelistsize != 0) {
>  			xo_open_container("vnodes");
> -			xo_emit("{D:,
> }{Lwc:vnodes}{Lw:count}{w:count/%ju}{Lw:active}{:active/%ju}",
> -			    (uintmax_t)sfp->f_nvnodelistsize,
> -			    (uintmax_t)sfp->f_avnodecount);
> +			xo_emit("{D:, }{Lwc:vnodes}{Lw:count}{w:count/%ju}",
> +			    (uintmax_t)sfp->f_nvnodelistsize);
>  			xo_close_container("vnodes");
>  		}
>  	}
> diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
> index e3818b67e84..3c55f83633a 100644
> --- a/sys/kern/vfs_mount.c
> +++ b/sys/kern/vfs_mount.c
> @@ -2610,9 +2610,6 @@ vfs_copyopt(struct vfsoptlist *opts, const char *name,
> void *dest, int len)
>  int
>  __vfs_statfs(struct mount *mp, struct statfs *sbp)
>  {
> -	struct vnode *vp;
> -	uint32_t count;
> -
>  	/*
>  	 * Filesystems only fill in part of the structure for updates, we
>  	 * have to read the entirety first to get all content.
> @@ -2628,15 +2625,6 @@ __vfs_statfs(struct mount *mp, struct statfs *sbp)
>  	sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
>  	sbp->f_nvnodelistsize = mp->mnt_nvnodelistsize;
>
> -	count = 0;
> -	MNT_ILOCK(mp);
> -	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
> -		if (vrefcnt(vp) > 0) /* racy but does not matter */
> -			count++;
> -	}
> -	MNT_IUNLOCK(mp);
> -	sbp->f_avnodecount = count;
> -
>  	return (mp->mnt_op->vfs_statfs(mp, sbp));
>  }
>
> diff --git a/sys/sys/mount.h b/sys/sys/mount.h
> index edac64171f9..ffb2676258f 100644
> --- a/sys/sys/mount.h
> +++ b/sys/sys/mount.h
> @@ -92,7 +92,7 @@ struct statfs {
>  	uint64_t f_syncreads;		/* count of sync reads since mount */
>  	uint64_t f_asyncreads;		/* count of async reads since mount */
>  	uint32_t f_nvnodelistsize;	/* # of vnodes */
> -	uint32_t f_avnodecount;		/* # of active vnodes */
> +	uint32_t f_spare0;		/* unused spare */
>  	uint64_t f_spare[9];		/* unused spare */
>  	uint32_t f_namemax;		/* maximum filename length */
>  	uid_t	  f_owner;		/* user that mounted the filesystem */
>
> the single assignment of:
> 	sbp->f_nvnodelistsize = mp->mnt_nvnodelistsize;
>
> shouldn't cause much of a performance issue.  Output is now:
>   root@client:~ # mount -v
>   192.168.35.1:/data/home/ambrisko/netboot on / (nfs, fsid 01ff003a3a000000,
> vnodes: count 567 )
>   devfs on /dev (devfs, fsid 00ff007171000000, vnodes: count 27 )
>   procfs on /proc (procfs, local, fsid 02ff000202000000, vnodes: count 2 )
>   linprocfs on /compat/linux/proc (linprocfs, local, fsid 03ff00b5b5000000,
> vnodes: count 2 )
>   linsysfs on /compat/linux/sys (linsysfs, local, fsid 04ff008a8a000000,
> vnodes: count 2 )
>   fdescfs on /dev/fd (fdescfs, fsid 05ff005959000000, vnodes: count 3 )
>   tmpfs on /tmp (tmpfs, local, fsid 06ff008787000000, vnodes: count 6 )
>   tmpfs on /var/tmp (tmpfs, local, fsid 07ff008787000000, vnodes: count 2 )
>   tmpfs on /var/run (tmpfs, local, fsid 08ff008787000000, vnodes: count 8 )
>   devfs on /compat/linux/dev (devfs, fsid 09ff007171000000, vnodes: count 4
> )
>   fdescfs on /compat/linux/dev/fd (fdescfs, fsid 0aff005959000000, vnodes:
> count 2 )
>   tmpfs on /compat/linux/dev/shm (tmpfs, local, fsid 0bff008787000000,
> vnodes: count 2 )
>   192.168.31.1:/data on /data (nfs, fsid 0cff003a3a000000, vnodes: count 2
> )
> root@client:~ #
>
> Thanks,
>
> Doug A.
>


-- 
Mateusz Guzik <mjguzik gmail.com>