git: 3d2fec7db856 - main - namei: Add the ability for the ABI to specify an alternate root path

From: Dmitry Chagin <dchagin_at_FreeBSD.org>
Date: Mon, 29 May 2023 08:20:06 UTC
The branch main has been updated by dchagin:

URL: https://cgit.FreeBSD.org/src/commit/?id=3d2fec7db856c67e1a94a87a846d8ffe6f48b61f

commit 3d2fec7db856c67e1a94a87a846d8ffe6f48b61f
Author:     Dmitry Chagin <dchagin@FreeBSD.org>
AuthorDate: 2023-05-29 08:15:28 +0000
Commit:     Dmitry Chagin <dchagin@FreeBSD.org>
CommitDate: 2023-05-29 08:15:28 +0000

    namei: Add the ability for the ABI to specify an alternate root path
    
    For now a non-native ABI (i.e., Linux) uses the kern_alternate_path()
    facility to dynamically reroot lookups. First, an attempt is made to
    look up the file in /compat/linux/original-path. If that fails, the
    lookup is done in /original-path. That requires a bit of code in
    every ABI syscall implementation where path name translation is needed.
    Also, our kern_alternate_path() does not properly look up absolute
    symlinks in the second attempt, i.e., it does not prepend the
    /compat/linux part to the resolved link.
    The change is intended to avoid this by specifying the ABI root directory
    for namei(), using one call to pwd_altroot() during exec-time into the ABI.
    In that case namei() will dynamically reroot lookups as mentioned above.
    
    PR:                     72920
    Reviewed by:            kib
    Differential revision:  https://reviews.freebsd.org/D38933
    MFC after:              2 month
---
 sys/kern/kern_descrip.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++--
 sys/kern/vfs_cache.c    |  4 ++--
 sys/kern/vfs_lookup.c   | 30 +++++++++++++++++++++---
 sys/sys/filedesc.h      |  4 ++++
 sys/sys/namei.h         | 11 +++++++--
 5 files changed, 101 insertions(+), 9 deletions(-)

diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 0be59e930dd4..908c3352514b 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -3839,6 +3839,11 @@ pwd_fill(struct pwd *oldpwd, struct pwd *newpwd)
 		vrefact(oldpwd->pwd_jdir);
 		newpwd->pwd_jdir = oldpwd->pwd_jdir;
 	}
+
+	if (newpwd->pwd_adir == NULL && oldpwd->pwd_adir != NULL) {
+		vrefact(oldpwd->pwd_adir);
+		newpwd->pwd_adir = oldpwd->pwd_adir;
+	}
 }
 
 struct pwd *
@@ -3930,6 +3935,8 @@ pwd_drop(struct pwd *pwd)
 		vrele(pwd->pwd_rdir);
 	if (pwd->pwd_jdir != NULL)
 		vrele(pwd->pwd_jdir);
+	if (pwd->pwd_adir != NULL)
+		vrele(pwd->pwd_adir);
 	uma_zfree_smr(pwd_zone, pwd);
 }
 
@@ -3967,6 +3974,8 @@ pwd_chroot(struct thread *td, struct vnode *vp)
 
 	vrefact(vp);
 	newpwd->pwd_rdir = vp;
+	vrefact(vp);
+	newpwd->pwd_adir = vp;
 	if (oldpwd->pwd_jdir == NULL) {
 		vrefact(vp);
 		newpwd->pwd_jdir = vp;
@@ -3997,6 +4006,40 @@ pwd_chdir(struct thread *td, struct vnode *vp)
 	pwd_drop(oldpwd);
 }
 
+/*
+ * Set pwd_adir when the process transitions to/from a non-native ABI.
+ */
+void
+pwd_altroot(struct thread *td, struct vnode *altroot_vp)
+{
+	struct pwddesc *pdp;
+	struct pwd *newpwd, *oldpwd;
+
+	newpwd = pwd_alloc();
+	pdp = td->td_proc->p_pd;
+	PWDDESC_XLOCK(pdp);
+	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
+	if (altroot_vp != NULL) {
+		/*
+		 * Native to non-native ABI: altroot_vp becomes the ABI root.
+		 */
+
+		vrefact(altroot_vp);
+		newpwd->pwd_adir = altroot_vp;
+	} else {
+		/*
+		 * Non-native to native ABI: the ABI root reverts to pwd_rdir.
+		 */
+
+		vrefact(oldpwd->pwd_rdir);
+		newpwd->pwd_adir = oldpwd->pwd_rdir;
+	}
+	pwd_fill(oldpwd, newpwd);
+	pwd_set(pdp, newpwd);
+	PWDDESC_XUNLOCK(pdp);
+	pwd_drop(oldpwd);
+}
+
 /*
  * jail_attach(2) changes both root and working directories.
  */
@@ -4030,6 +4073,8 @@ pwd_chroot_chdir(struct thread *td, struct vnode *vp)
 		vrefact(vp);
 		newpwd->pwd_jdir = vp;
 	}
+	vrefact(vp);
+	newpwd->pwd_adir = vp;
 	pwd_fill(oldpwd, newpwd);
 	pwd_set(pdp, newpwd);
 	PWDDESC_XUNLOCK(pdp);
@@ -4046,7 +4091,8 @@ pwd_ensure_dirs(void)
 	pdp = curproc->p_pd;
 	PWDDESC_XLOCK(pdp);
 	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
-	if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL) {
+	if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL &&
+	    oldpwd->pwd_adir != NULL) {
 		PWDDESC_XUNLOCK(pdp);
 		return;
 	}
@@ -4064,6 +4110,10 @@ pwd_ensure_dirs(void)
 		vrefact(rootvnode);
 		newpwd->pwd_rdir = rootvnode;
 	}
+	if (newpwd->pwd_adir == NULL) {
+		vrefact(rootvnode);
+		newpwd->pwd_adir = rootvnode;
+	}
 	pwd_set(pdp, newpwd);
 	PWDDESC_XUNLOCK(pdp);
 	pwd_drop(oldpwd);
@@ -4084,6 +4134,8 @@ pwd_set_rootvnode(void)
 	newpwd->pwd_cdir = rootvnode;
 	vrefact(rootvnode);
 	newpwd->pwd_rdir = rootvnode;
+	vrefact(rootvnode);
+	newpwd->pwd_adir = rootvnode;
 	pwd_fill(oldpwd, newpwd);
 	pwd_set(pdp, newpwd);
 	PWDDESC_XUNLOCK(pdp);
@@ -4119,7 +4171,8 @@ mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
 		if (oldpwd == NULL ||
 		    (oldpwd->pwd_cdir != olddp &&
 		    oldpwd->pwd_rdir != olddp &&
-		    oldpwd->pwd_jdir != olddp)) {
+		    oldpwd->pwd_jdir != olddp &&
+		    oldpwd->pwd_adir != olddp)) {
 			PWDDESC_XUNLOCK(pdp);
 			pddrop(pdp);
 			continue;
@@ -4136,6 +4189,10 @@ mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
 			vrefact(newdp);
 			newpwd->pwd_jdir = newdp;
 		}
+		if (oldpwd->pwd_adir == olddp) {
+			vrefact(newdp);
+			newpwd->pwd_adir = newdp;
+		}
 		pwd_fill(oldpwd, newpwd);
 		pwd_set(pdp, newpwd);
 		PWDDESC_XUNLOCK(pdp);
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index e4977392349f..8daaf5bc53ad 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -4349,7 +4349,7 @@ cache_fpl_terminated(struct cache_fpl *fpl)
 	(NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
 	 FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | WILLBEDIR | \
 	 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \
-	 OPENWRITE | WANTIOCTLCAPS)
+	 OPENWRITE | WANTIOCTLCAPS | ISRESTARTED)
 
 #define CACHE_FPL_INTERNAL_CN_FLAGS \
 	(ISDOTDOT | MAKEENTRY | ISLASTCN)
@@ -6238,7 +6238,7 @@ cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
 	fpl.pwd = pwdp;
 	pwd = pwd_get_smr();
 	*(fpl.pwd) = pwd;
-	ndp->ni_rootdir = pwd->pwd_rdir;
+	namei_setup_rootdir(ndp, cnp, pwd);
 	ndp->ni_topdir = pwd->pwd_jdir;
 
 	if (cnp->cn_pnbuf[0] == '/') {
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
index a75ea4ca16d6..593e1e487c6f 100644
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -81,6 +81,13 @@ static void NDVALIDATE_impl(struct nameidata *, int);
 #define NDVALIDATE(ndp)
 #endif
 
+#define	NDRESTART(ndp) do {	/* set up ndp for a namei() restart */	\
+	NDREINIT_DBG(ndp);						\
+	(ndp)->ni_resflags = 0;						\
+	(ndp)->ni_cnd.cn_flags &= ~NAMEI_INTERNAL_FLAGS;		\
+	(ndp)->ni_cnd.cn_flags |= ISRESTARTED;				\
+} while (0)
+
 SDT_PROVIDER_DEFINE(vfs);
 SDT_PROBE_DEFINE4(vfs, namei, lookup, entry, "struct vnode *", "char *",
     "unsigned long", "bool");
@@ -334,7 +341,7 @@ namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp)
 	 * The reference on ni_rootdir is acquired in the block below to avoid
 	 * back-to-back atomics for absolute lookups.
 	 */
-	ndp->ni_rootdir = pwd->pwd_rdir;
+	namei_setup_rootdir(ndp, cnp, pwd);
 	ndp->ni_topdir = pwd->pwd_jdir;
 
 	if (cnp->cn_pnbuf[0] == '/') {
@@ -594,6 +601,7 @@ namei(struct nameidata *ndp)
 	MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
 	    ndp->ni_startdir->v_type == VBAD);
 
+restart:
 	ndp->ni_lcf = 0;
 	ndp->ni_loopcnt = 0;
 	ndp->ni_vp = NULL;
@@ -628,6 +636,12 @@ namei(struct nameidata *ndp)
 	case CACHE_FPL_STATUS_HANDLED:
 		if (error == 0)
 			NDVALIDATE(ndp);
+		else if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir &&
+		    (cnp->cn_flags & ISRESTARTED) == 0)) {
+			namei_cleanup_cnp(cnp);
+			NDRESTART(ndp);
+			goto restart;
+		}
 		return (error);
 	case CACHE_FPL_STATUS_PARTIAL:
 		TAILQ_INIT(&ndp->ni_cap_tracker);
@@ -668,8 +682,18 @@ namei(struct nameidata *ndp)
 	for (;;) {
 		ndp->ni_startdir = dp;
 		error = vfs_lookup(ndp);
-		if (error != 0)
-			goto out;
+		if (error != 0) {
+			if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir &&
+			    error == ENOENT &&
+			    (cnp->cn_flags & ISRESTARTED) == 0)) {
+				nameicap_cleanup(ndp);
+				pwd_drop(pwd);
+				namei_cleanup_cnp(cnp);
+				NDRESTART(ndp);
+				goto restart;
+			} else
+				goto out;
+		}
 
 		/*
 		 * If not a symbolic link, we're done.
diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h
index 578b84696663..bba12d08287c 100644
--- a/sys/sys/filedesc.h
+++ b/sys/sys/filedesc.h
@@ -89,6 +89,8 @@ struct fdescenttbl {
 /*
  * This struct is copy-on-write and allocated from an SMR zone.
  * All fields are constant after initialization apart from the reference count.
+ * The ABI root directory is initialized to the root directory and changes
+ * when the process transitions to or from a non-native ABI.
  *
  * Check pwd_* routines for usage.
  */
@@ -97,6 +99,7 @@ struct pwd {
 	struct	vnode	*pwd_cdir;	/* current directory */
 	struct	vnode	*pwd_rdir;	/* root directory */
 	struct	vnode	*pwd_jdir;	/* jail root directory */
+	struct	vnode	*pwd_adir;	/* abi root directory */
 };
 typedef SMR_POINTER(struct pwd *) smrpwd_t;
 
@@ -342,6 +345,7 @@ struct pwddesc *pdinit(struct pwddesc *pdp, bool keeplock);
 struct pwddesc *pdshare(struct pwddesc *pdp);
 void	pdunshare(struct thread *td);
 
+void	pwd_altroot(struct thread *td, struct vnode *altroot_vp);
 void	pwd_chdir(struct thread *td, struct vnode *vp);
 int	pwd_chroot(struct thread *td, struct vnode *vp);
 int	pwd_chroot_chdir(struct thread *td, struct vnode *vp);
diff --git a/sys/sys/namei.h b/sys/sys/namei.h
index e12d79b19c6e..88ddb0f13458 100644
--- a/sys/sys/namei.h
+++ b/sys/sys/namei.h
@@ -159,7 +159,7 @@ int	cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
  * Namei parameter descriptors.
  */
 #define	RDONLY		0x00000200 /* lookup with read-only semantics */
-/* UNUSED		0x00000400 */
+#define	ISRESTARTED	0x00000400 /* restarted namei */
 /* UNUSED		0x00000800 */
 #define	ISWHITEOUT	0x00001000 /* found whiteout */
 #define	DOWHITEOUT	0x00002000 /* do whiteouts */
@@ -187,7 +187,7 @@ int	cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
  */
 #define NAMEI_INTERNAL_FLAGS	\
 	(NOEXECCHECK | MAKEENTRY | ISSYMLINK | ISLASTCN | ISDOTDOT | \
-	 TRAILINGSLASH)
+	 TRAILINGSLASH | ISRESTARTED)
 
 /*
  * Namei results flags
@@ -293,6 +293,13 @@ int	namei(struct nameidata *ndp);
 int	vfs_lookup(struct nameidata *ndp);
 int	vfs_relookup(struct vnode *dvp, struct vnode **vpp,
 	    struct componentname *cnp, bool refstart);
+
+#define namei_setup_rootdir(ndp, cnp, pwd) do {					\
+	if (__predict_true(((cnp)->cn_flags & ISRESTARTED) == 0))		\
+		(ndp)->ni_rootdir = (pwd)->pwd_adir;	/* ABI root */		\
+	else									\
+		(ndp)->ni_rootdir = (pwd)->pwd_rdir;	/* root dir */		\
+} while (0)
 #endif
 
 /*