svn commit: r347151 - in head: libexec/rtld-elf sys/compat/linux sys/fs/deadfs sys/fs/nfsclient sys/fs/nullfs sys/fs/unionfs sys/kern sys/sys sys/ufs/ufs sys/vm

Konstantin Belousov kib at FreeBSD.org
Sun May 5 11:20:48 UTC 2019


Author: kib
Date: Sun May  5 11:20:43 2019
New Revision: 347151
URL: https://svnweb.freebsd.org/changeset/base/347151

Log:
  Switch to use shared vnode locks for text files during image activation.
  
  kern_execve() locks text vnode exclusive to be able to set and clear
  VV_TEXT flag. VV_TEXT is mutually exclusive with the v_writecount > 0
  condition.
  
  The change removes VV_TEXT, replacing it with the condition
  v_writecount <= -1, and puts v_writecount under the vnode interlock.
  Each text reference decrements v_writecount.  To clear the text
  reference when the segment is unmapped, it is recorded in the
  vm_map_entry backed by the text file as MAP_ENTRY_VN_TEXT flag, and
  v_writecount is incremented on the map entry removal
  
  The operations like VOP_ADD_WRITECOUNT() and VOP_SET_TEXT() check that
  v_writecount does not contradict the desired change.  vn_writecheck()
  is now racy and its use was eliminated everywhere except access.
  Atomic check for writeability and increment of v_writecount is
  performed by the VOP.  vn_truncate() now increments v_writecount
  around VOP_SETATTR() call, lack of which is arguably a bug on its own.
  
  nullfs bypasses v_writecount to the lower vnode always, so nullfs
  vnode has its own v_writecount correct, and lower vnode gets all
  references, since object->handle is always lower vnode.
  
  On the text vnode' vm object dealloc, the v_writecount value is reset
  to zero, and deadfs vop_unset_text short-circuit the operation.
  Reclamation of lowervp always reclaims all nullfs vnodes referencing
  lowervp first, so no stray references are left.
  
  Reviewed by:	markj, trasz
  Tested by:	mjg, pho
  Sponsored by:	The FreeBSD Foundation
  MFC after:	1 month
  Differential revision:	https://reviews.freebsd.org/D19923

Modified:
  head/libexec/rtld-elf/rtld.c
  head/sys/compat/linux/linux_misc.c
  head/sys/fs/deadfs/dead_vnops.c
  head/sys/fs/nfsclient/nfs_clbio.c
  head/sys/fs/nfsclient/nfs_clvnops.c
  head/sys/fs/nullfs/null_vnops.c
  head/sys/fs/unionfs/union_subr.c
  head/sys/kern/imgact_aout.c
  head/sys/kern/imgact_elf.c
  head/sys/kern/kern_exec.c
  head/sys/kern/vfs_default.c
  head/sys/kern/vfs_subr.c
  head/sys/kern/vfs_vnops.c
  head/sys/kern/vnode_if.src
  head/sys/sys/imgact.h
  head/sys/sys/vnode.h
  head/sys/ufs/ufs/ufs_extattr.c
  head/sys/vm/vm_fault.c
  head/sys/vm/vm_map.c
  head/sys/vm/vm_map.h
  head/sys/vm/vm_mmap.c
  head/sys/vm/vm_object.c
  head/sys/vm/vnode_pager.c

Modified: head/libexec/rtld-elf/rtld.c
==============================================================================
--- head/libexec/rtld-elf/rtld.c	Sun May  5 11:06:19 2019	(r347150)
+++ head/libexec/rtld-elf/rtld.c	Sun May  5 11:20:43 2019	(r347151)
@@ -458,7 +458,7 @@ _rtld(Elf_Addr *sp, func_ptr_type *exit_proc, Obj_Entr
 		 * others x bit is enabled.
 		 * mmap(2) does not allow to mmap with PROT_EXEC if
 		 * binary' file comes from noexec mount.  We cannot
-		 * set VV_TEXT on the binary.
+		 * set a text reference on the binary.
 		 */
 		dir_enable = false;
 		if (st.st_uid == geteuid()) {

Modified: head/sys/compat/linux/linux_misc.c
==============================================================================
--- head/sys/compat/linux/linux_misc.c	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/compat/linux/linux_misc.c	Sun May  5 11:20:43 2019	(r347151)
@@ -258,13 +258,16 @@ linux_uselib(struct thread *td, struct linux_uselib_ar
 	struct nameidata ni;
 	struct vnode *vp;
 	struct exec *a_out;
+	vm_map_t map;
+	vm_map_entry_t entry;
 	struct vattr attr;
 	vm_offset_t vmaddr;
 	unsigned long file_offset;
 	unsigned long bss_size;
 	char *library;
 	ssize_t aresid;
-	int error, locked, writecount;
+	int error;
+	bool locked, opened, textset;
 
 	LCONVPATHEXIST(td, args->library, &library);
 
@@ -274,8 +277,10 @@ linux_uselib(struct thread *td, struct linux_uselib_ar
 #endif
 
 	a_out = NULL;
-	locked = 0;
 	vp = NULL;
+	locked = false;
+	textset = false;
+	opened = false;
 
 	NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1,
 	    UIO_SYSSPACE, library, td);
@@ -291,17 +296,8 @@ linux_uselib(struct thread *td, struct linux_uselib_ar
 	 * From here on down, we have a locked vnode that must be unlocked.
 	 * XXX: The code below largely duplicates exec_check_permissions().
 	 */
-	locked = 1;
+	locked = true;
 
-	/* Writable? */
-	error = VOP_GET_WRITECOUNT(vp, &writecount);
-	if (error != 0)
-		goto cleanup;
-	if (writecount != 0) {
-		error = ETXTBSY;
-		goto cleanup;
-	}
-
 	/* Executable? */
 	error = VOP_GETATTR(vp, &attr, td->td_ucred);
 	if (error)
@@ -339,6 +335,7 @@ linux_uselib(struct thread *td, struct linux_uselib_ar
 	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
 	if (error)
 		goto cleanup;
+	opened = true;
 
 	/* Pull in executable header into exec_map */
 	error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE,
@@ -401,15 +398,16 @@ linux_uselib(struct thread *td, struct linux_uselib_ar
 
 	/*
 	 * Prevent more writers.
-	 * XXX: Note that if any of the VM operations fail below we don't
-	 * clear this flag.
 	 */
-	VOP_SET_TEXT(vp);
+	error = VOP_SET_TEXT(vp);
+	if (error != 0)
+		goto cleanup;
+	textset = true;
 
 	/*
 	 * Lock no longer needed
 	 */
-	locked = 0;
+	locked = false;
 	VOP_UNLOCK(vp, 0);
 
 	/*
@@ -456,11 +454,21 @@ linux_uselib(struct thread *td, struct linux_uselib_ar
 		 * Map it all into the process's space as a single
 		 * copy-on-write "data" segment.
 		 */
-		error = vm_mmap(&td->td_proc->p_vmspace->vm_map, &vmaddr,
+		map = &td->td_proc->p_vmspace->vm_map;
+		error = vm_mmap(map, &vmaddr,
 		    a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL,
 		    MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset);
 		if (error)
 			goto cleanup;
+		vm_map_lock(map);
+		if (!vm_map_lookup_entry(map, vmaddr, &entry)) {
+			vm_map_unlock(map);
+			error = EDOOFUS;
+			goto cleanup;
+		}
+		entry->eflags |= MAP_ENTRY_VN_EXEC;
+		vm_map_unlock(map);
+		textset = false;
 	}
 #ifdef DEBUG
 	printf("mem=%08lx = %08lx %08lx\n", (long)vmaddr, ((long *)vmaddr)[0],
@@ -480,7 +488,14 @@ linux_uselib(struct thread *td, struct linux_uselib_ar
 	}
 
 cleanup:
-	/* Unlock vnode if needed */
+	if (opened) {
+		if (locked)
+			VOP_UNLOCK(vp, 0);
+		locked = false;
+		VOP_CLOSE(vp, FREAD, td->td_ucred, td);
+	}
+	if (textset)
+		VOP_UNSET_TEXT_CHECKED(vp);
 	if (locked)
 		VOP_UNLOCK(vp, 0);
 

Modified: head/sys/fs/deadfs/dead_vnops.c
==============================================================================
--- head/sys/fs/deadfs/dead_vnops.c	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/fs/deadfs/dead_vnops.c	Sun May  5 11:20:43 2019	(r347151)
@@ -47,6 +47,7 @@ static vop_lookup_t	dead_lookup;
 static vop_open_t	dead_open;
 static vop_getwritemount_t dead_getwritemount;
 static vop_rename_t	dead_rename;
+static vop_unset_text_t	dead_unset_text;
 
 struct vop_vector dead_vnodeops = {
 	.vop_default =		&default_vnodeops,
@@ -76,6 +77,7 @@ struct vop_vector dead_vnodeops = {
 	.vop_setattr =		VOP_EBADF,
 	.vop_symlink =		VOP_PANIC,
 	.vop_vptocnp =		VOP_EBADF,
+	.vop_unset_text =	dead_unset_text,
 	.vop_write =		dead_write,
 };
 
@@ -147,4 +149,11 @@ dead_rename(struct vop_rename_args *ap)
 
 	vop_rename_fail(ap);
 	return (EXDEV);
+}
+
+static int
+dead_unset_text(struct vop_unset_text_args *ap)
+{
+
+	return (0);
 }

Modified: head/sys/fs/nfsclient/nfs_clbio.c
==============================================================================
--- head/sys/fs/nfsclient/nfs_clbio.c	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/fs/nfsclient/nfs_clbio.c	Sun May  5 11:20:43 2019	(r347151)
@@ -1639,7 +1639,7 @@ ncl_doio(struct vnode *vp, struct buf *bp, struct ucre
 		    }
 		}
 		/* ASSERT_VOP_LOCKED(vp, "ncl_doio"); */
-		if (p && (vp->v_vflag & VV_TEXT)) {
+		if (p && vp->v_writecount <= -1) {
 			mtx_lock(&np->n_mtx);
 			if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.na_mtime)) {
 				mtx_unlock(&np->n_mtx);

Modified: head/sys/fs/nfsclient/nfs_clvnops.c
==============================================================================
--- head/sys/fs/nfsclient/nfs_clvnops.c	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/fs/nfsclient/nfs_clvnops.c	Sun May  5 11:20:43 2019	(r347151)
@@ -3442,8 +3442,7 @@ nfs_set_text(struct vop_set_text_args *ap)
 	np->n_mtime = np->n_vattr.na_mtime;
 	mtx_unlock(&np->n_mtx);
 
-	vp->v_vflag |= VV_TEXT;
-	return (0);
+	return (vop_stdset_text(ap));
 }
 
 /*

Modified: head/sys/fs/nullfs/null_vnops.c
==============================================================================
--- head/sys/fs/nullfs/null_vnops.c	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/fs/nullfs/null_vnops.c	Sun May  5 11:20:43 2019	(r347151)
@@ -339,15 +339,15 @@ null_add_writecount(struct vop_add_writecount_args *ap
 
 	vp = ap->a_vp;
 	lvp = NULLVPTOLOWERVP(vp);
-	KASSERT(vp->v_writecount + ap->a_inc >= 0, ("wrong writecount inc"));
-	if (vp->v_writecount > 0 && vp->v_writecount + ap->a_inc == 0)
-		error = VOP_ADD_WRITECOUNT(lvp, -1);
-	else if (vp->v_writecount == 0 && vp->v_writecount + ap->a_inc > 0)
-		error = VOP_ADD_WRITECOUNT(lvp, 1);
-	else
-		error = 0;
+	VI_LOCK(vp);
+	/* text refs are bypassed to lowervp */
+	VNASSERT(vp->v_writecount >= 0, vp, ("wrong null writecount"));
+	VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp,
+	    ("wrong writecount inc %d", ap->a_inc));
+	error = VOP_ADD_WRITECOUNT(lvp, ap->a_inc);
 	if (error == 0)
 		vp->v_writecount += ap->a_inc;
+	VI_UNLOCK(vp);
 	return (error);
 }
 
@@ -802,15 +802,17 @@ null_reclaim(struct vop_reclaim_args *ap)
 	vp->v_data = NULL;
 	vp->v_object = NULL;
 	vp->v_vnlock = &vp->v_lock;
-	VI_UNLOCK(vp);
 
 	/*
-	 * If we were opened for write, we leased one write reference
+	 * If we were opened for write, we leased the write reference
 	 * to the lower vnode.  If this is a reclamation due to the
 	 * forced unmount, undo the reference now.
 	 */
 	if (vp->v_writecount > 0)
-		VOP_ADD_WRITECOUNT(lowervp, -1);
+		VOP_ADD_WRITECOUNT(lowervp, -vp->v_writecount);
+
+	VI_UNLOCK(vp);
+
 	if ((xp->null_flags & NULLV_NOUNLOCK) != 0)
 		vunref(lowervp);
 	else

Modified: head/sys/fs/unionfs/union_subr.c
==============================================================================
--- head/sys/fs/unionfs/union_subr.c	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/fs/unionfs/union_subr.c	Sun May  5 11:20:43 2019	(r347151)
@@ -941,10 +941,14 @@ unionfs_vn_create_on_upper(struct vnode **vpp, struct 
 		vput(vp);
 		goto unionfs_vn_create_on_upper_free_out1;
 	}
-	VOP_ADD_WRITECOUNT(vp, 1);
+	error = VOP_ADD_WRITECOUNT(vp, 1);
 	CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",  __func__, vp,
 	    vp->v_writecount);
-	*vpp = vp;
+	if (error == 0) {
+		*vpp = vp;
+	} else {
+		VOP_CLOSE(vp, fmode, cred, td);
+	}
 
 unionfs_vn_create_on_upper_free_out1:
 	VOP_UNLOCK(udvp, LK_RELEASE);
@@ -1078,7 +1082,7 @@ unionfs_copyfile(struct unionfs_node *unp, int docopy,
 		}
 	}
 	VOP_CLOSE(uvp, FWRITE, cred, td);
-	VOP_ADD_WRITECOUNT(uvp, -1);
+	VOP_ADD_WRITECOUNT_CHECKED(uvp, -1);
 	CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, uvp,
 	    uvp->v_writecount);
 

Modified: head/sys/kern/imgact_aout.c
==============================================================================
--- head/sys/kern/imgact_aout.c	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/kern/imgact_aout.c	Sun May  5 11:20:43 2019	(r347151)
@@ -247,8 +247,8 @@ exec_aout_imgact(struct image_params *imgp)
 	    /* data + bss can't exceed rlimit */
 	    a_out->a_data + bss_size > lim_cur_proc(imgp->proc, RLIMIT_DATA) ||
 	    racct_set(imgp->proc, RACCT_DATA, a_out->a_data + bss_size) != 0) {
-			PROC_UNLOCK(imgp->proc);
-			return (ENOMEM);
+		PROC_UNLOCK(imgp->proc);
+		return (ENOMEM);
 	}
 	PROC_UNLOCK(imgp->proc);
 
@@ -267,7 +267,7 @@ exec_aout_imgact(struct image_params *imgp)
 	 */
 	error = exec_new_vmspace(imgp, &aout_sysvec);
 
-	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+	vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 	if (error)
 		return (error);
 
@@ -286,12 +286,13 @@ exec_aout_imgact(struct image_params *imgp)
 		file_offset,
 		virtual_offset, text_end,
 		VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
-		MAP_COPY_ON_WRITE | MAP_PREFAULT);
+		MAP_COPY_ON_WRITE | MAP_PREFAULT | MAP_VN_EXEC);
 	if (error) {
 		vm_map_unlock(map);
 		vm_object_deallocate(object);
 		return (error);
 	}
+	VOP_SET_TEXT_CHECKED(imgp->vp);
 	data_end = text_end + a_out->a_data;
 	if (a_out->a_data) {
 		vm_object_reference(object);
@@ -299,12 +300,13 @@ exec_aout_imgact(struct image_params *imgp)
 			file_offset + a_out->a_text,
 			text_end, data_end,
 			VM_PROT_ALL, VM_PROT_ALL,
-			MAP_COPY_ON_WRITE | MAP_PREFAULT);
+			MAP_COPY_ON_WRITE | MAP_PREFAULT | MAP_VN_EXEC);
 		if (error) {
 			vm_map_unlock(map);
 			vm_object_deallocate(object);
 			return (error);
 		}
+		VOP_SET_TEXT_CHECKED(imgp->vp);
 	}
 
 	if (bss_size) {

Modified: head/sys/kern/imgact_elf.c
==============================================================================
--- head/sys/kern/imgact_elf.c	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/kern/imgact_elf.c	Sun May  5 11:20:43 2019	(r347151)
@@ -526,13 +526,17 @@ __elfN(map_insert)(struct image_params *imgp, vm_map_t
 	} else {
 		vm_object_reference(object);
 		rv = vm_map_fixed(map, object, offset, start, end - start,
-		    prot, VM_PROT_ALL, cow | MAP_CHECK_EXCL);
+		    prot, VM_PROT_ALL, cow | MAP_CHECK_EXCL |
+		    (object != NULL ? MAP_VN_EXEC : 0));
 		if (rv != KERN_SUCCESS) {
 			locked = VOP_ISLOCKED(imgp->vp);
 			VOP_UNLOCK(imgp->vp, 0);
 			vm_object_deallocate(object);
 			vn_lock(imgp->vp, locked | LK_RETRY);
 			return (rv);
+		} else if (object != NULL) {
+			MPASS(imgp->vp->v_object == object);
+			VOP_SET_TEXT_CHECKED(imgp->vp);
 		}
 	}
 	return (KERN_SUCCESS);
@@ -589,13 +593,8 @@ __elfN(load_section)(struct image_params *imgp, vm_oof
 		cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
 		    (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP);
 
-		rv = __elfN(map_insert)(imgp, map,
-				      object,
-				      file_addr,	/* file offset */
-				      map_addr,		/* virtual start */
-				      map_addr + map_len,/* virtual end */
-				      prot,
-				      cow);
+		rv = __elfN(map_insert)(imgp, map, object, file_addr,
+		    map_addr, map_addr + map_len, prot, cow);
 		if (rv != KERN_SUCCESS)
 			return (EINVAL);
 
@@ -716,7 +715,7 @@ __elfN(load_file)(struct proc *p, const char *file, u_
 	struct nameidata *nd;
 	struct vattr *attr;
 	struct image_params *imgp;
-	u_long flags, rbase;
+	u_long rbase;
 	u_long base_addr = 0;
 	int error;
 
@@ -744,10 +743,8 @@ __elfN(load_file)(struct proc *p, const char *file, u_
 	imgp->object = NULL;
 	imgp->execlabel = NULL;
 
-	flags = FOLLOW | LOCKSHARED | LOCKLEAF;
-
-again:
-	NDINIT(nd, LOOKUP, flags, UIO_SYSSPACE, file, curthread);
+	NDINIT(nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE, file,
+	    curthread);
 	if ((error = namei(nd)) != 0) {
 		nd->ni_vp = NULL;
 		goto fail;
@@ -762,27 +759,6 @@ again:
 	if (error)
 		goto fail;
 
-	/*
-	 * Also make certain that the interpreter stays the same,
-	 * so set its VV_TEXT flag, too.  Since this function is only
-	 * used to load the interpreter, the VV_TEXT is almost always
-	 * already set.
-	 */
-	if (VOP_IS_TEXT(nd->ni_vp) == 0) {
-		if (VOP_ISLOCKED(nd->ni_vp) != LK_EXCLUSIVE) {
-			/*
-			 * LK_UPGRADE could have resulted in dropping
-			 * the lock.  Just try again from the start,
-			 * this time with exclusive vnode lock.
-			 */
-			vput(nd->ni_vp);
-			flags &= ~LOCKSHARED;
-			goto again;
-		}
-
-		VOP_SET_TEXT(nd->ni_vp);
-	}
-
 	error = exec_map_first_page(imgp);
 	if (error)
 		goto fail;
@@ -825,9 +801,11 @@ fail:
 	if (imgp->firstpage)
 		exec_unmap_first_page(imgp);
 
-	if (nd->ni_vp)
+	if (nd->ni_vp) {
+		if (imgp->textset)
+			VOP_UNSET_TEXT_CHECKED(nd->ni_vp);
 		vput(nd->ni_vp);
-
+	}
 	free(tempdata, M_TEMP);
 
 	return (error);
@@ -961,7 +939,7 @@ __elfN(get_interp)(struct image_params *imgp, const El
 		if (interp == NULL) {
 			VOP_UNLOCK(imgp->vp, 0);
 			interp = malloc(interp_name_len + 1, M_TEMP, M_WAITOK);
-			vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+			vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		}
 		error = vn_rdwr(UIO_READ, imgp->vp, interp,
 		    interp_name_len, phdr->p_offset,
@@ -1228,7 +1206,7 @@ __CONCAT(exec_, __elfN(imgact))(struct image_params *i
 		    maxv / 2, 1UL << flsl(maxalign));
 	}
 
-	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+	vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 	if (error != 0)
 		goto ret;
 
@@ -1272,7 +1250,7 @@ __CONCAT(exec_, __elfN(imgact))(struct image_params *i
 		}
 		error = __elfN(load_interp)(imgp, brand_info, interp, &addr,
 		    &imgp->entry_addr);
-		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		if (error != 0)
 			goto ret;
 	} else
@@ -1285,7 +1263,7 @@ __CONCAT(exec_, __elfN(imgact))(struct image_params *i
 	if (elf_auxargs == NULL) {
 		VOP_UNLOCK(imgp->vp, 0);
 		elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
-		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 	}
 	elf_auxargs->execfd = -1;
 	elf_auxargs->phdr = proghdr + et_dyn_addr;
@@ -2570,7 +2548,7 @@ __elfN(parse_notes)(struct image_params *imgp, Elf_Not
 		if (buf == NULL) {
 			VOP_UNLOCK(imgp->vp, 0);
 			buf = malloc(pnote->p_filesz, M_TEMP, M_WAITOK);
-			vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+			vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		}
 		error = vn_rdwr(UIO_READ, imgp->vp, buf, pnote->p_filesz,
 		    pnote->p_offset, UIO_SYSSPACE, IO_NODELOCKED,

Modified: head/sys/kern/kern_exec.c
==============================================================================
--- head/sys/kern/kern_exec.c	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/kern/kern_exec.c	Sun May  5 11:20:43 2019	(r347151)
@@ -375,7 +375,6 @@ do_execve(struct thread *td, struct image_args *args, 
 #endif
 	struct vnode *oldtextvp = NULL, *newtextvp;
 	int credential_changing;
-	int textset;
 #ifdef MAC
 	struct label *interpvplabel = NULL;
 	int will_transition;
@@ -423,8 +422,8 @@ do_execve(struct thread *td, struct image_args *args, 
 	 * interpreter if this is an interpreted binary.
 	 */
 	if (args->fname != NULL) {
-		NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
-		    | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
+		NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | LOCKSHARED | FOLLOW |
+		    SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
 	}
 
 	SDT_PROBE1(proc, , , exec, args->fname);
@@ -457,13 +456,14 @@ interpret:
 		error = fgetvp_exec(td, args->fd, &cap_fexecve_rights, &newtextvp);
 		if (error)
 			goto exec_fail;
-		vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY);
+		vn_lock(newtextvp, LK_SHARED | LK_RETRY);
 		AUDIT_ARG_VNODE1(newtextvp);
 		imgp->vp = newtextvp;
 	}
 
 	/*
-	 * Check file permissions (also 'opens' file)
+	 * Check file permissions.  Also 'opens' file and sets its vnode to
+	 * text mode.
 	 */
 	error = exec_check_permissions(imgp);
 	if (error)
@@ -473,16 +473,6 @@ interpret:
 	if (imgp->object != NULL)
 		vm_object_reference(imgp->object);
 
-	/*
-	 * Set VV_TEXT now so no one can write to the executable while we're
-	 * activating it.
-	 *
-	 * Remember if this was set before and unset it in case this is not
-	 * actually an executable image.
-	 */
-	textset = VOP_IS_TEXT(imgp->vp);
-	VOP_SET_TEXT(imgp->vp);
-
 	error = exec_map_first_page(imgp);
 	if (error)
 		goto exec_fail_dealloc;
@@ -610,11 +600,8 @@ interpret:
 	}
 
 	if (error) {
-		if (error == -1) {
-			if (textset == 0)
-				VOP_UNSET_TEXT(imgp->vp);
+		if (error == -1)
 			error = ENOEXEC;
-		}
 		goto exec_fail_dealloc;
 	}
 
@@ -625,12 +612,13 @@ interpret:
 	if (imgp->interpreted) {
 		exec_unmap_first_page(imgp);
 		/*
-		 * VV_TEXT needs to be unset for scripts.  There is a short
-		 * period before we determine that something is a script where
-		 * VV_TEXT will be set. The vnode lock is held over this
-		 * entire period so nothing should illegitimately be blocked.
+		 * The text reference needs to be removed for scripts.
+		 * There is a short period before we determine that
+		 * something is a script where text reference is active.
+		 * The vnode lock is held over this entire period
+		 * so nothing should illegitimately be blocked.
 		 */
-		VOP_UNSET_TEXT(imgp->vp);
+		VOP_UNSET_TEXT_CHECKED(imgp->vp);
 		/* free name buffer and old vnode */
 		if (args->fname != NULL)
 			NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -886,6 +874,8 @@ exec_fail_dealloc:
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (imgp->opened)
 			VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
+		if (imgp->textset)
+			VOP_UNSET_TEXT_CHECKED(imgp->vp);
 		if (error != 0)
 			vput(imgp->vp);
 		else
@@ -1706,7 +1696,7 @@ exec_check_permissions(struct image_params *imgp)
 	struct vnode *vp = imgp->vp;
 	struct vattr *attr = imgp->attr;
 	struct thread *td;
-	int error, writecount;
+	int error;
 
 	td = curthread;
 
@@ -1750,12 +1740,17 @@ exec_check_permissions(struct image_params *imgp)
 	/*
 	 * Check number of open-for-writes on the file and deny execution
 	 * if there are any.
+	 *
+	 * Add a text reference now so no one can write to the
+	 * executable while we're activating it.
+	 *
+	 * Remember if this was set before and unset it in case this is not
+	 * actually an executable image.
 	 */
-	error = VOP_GET_WRITECOUNT(vp, &writecount);
+	error = VOP_SET_TEXT(vp);
 	if (error != 0)
 		return (error);
-	if (writecount != 0)
-		return (ETXTBSY);
+	imgp->textset = true;
 
 	/*
 	 * Call filesystem specific open routine (which does nothing in the

Modified: head/sys/kern/vfs_default.c
==============================================================================
--- head/sys/kern/vfs_default.c	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/kern/vfs_default.c	Sun May  5 11:20:43 2019	(r347151)
@@ -81,9 +81,7 @@ static int	dirent_exists(struct vnode *vp, const char 
 #define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4)
 
 static int vop_stdis_text(struct vop_is_text_args *ap);
-static int vop_stdset_text(struct vop_set_text_args *ap);
 static int vop_stdunset_text(struct vop_unset_text_args *ap);
-static int vop_stdget_writecount(struct vop_get_writecount_args *ap);
 static int vop_stdadd_writecount(struct vop_add_writecount_args *ap);
 static int vop_stdfdatasync(struct vop_fdatasync_args *ap);
 static int vop_stdgetpages_async(struct vop_getpages_async_args *ap);
@@ -141,7 +139,6 @@ struct vop_vector default_vnodeops = {
 	.vop_is_text =		vop_stdis_text,
 	.vop_set_text =		vop_stdset_text,
 	.vop_unset_text =	vop_stdunset_text,
-	.vop_get_writecount =	vop_stdget_writecount,
 	.vop_add_writecount =	vop_stdadd_writecount,
 };
 
@@ -1070,39 +1067,63 @@ static int
 vop_stdis_text(struct vop_is_text_args *ap)
 {
 
-	return ((ap->a_vp->v_vflag & VV_TEXT) != 0);
+	return (ap->a_vp->v_writecount < 0);
 }
 
-static int
+int
 vop_stdset_text(struct vop_set_text_args *ap)
 {
+	struct vnode *vp;
+	int error;
 
-	ap->a_vp->v_vflag |= VV_TEXT;
-	return (0);
+	vp = ap->a_vp;
+	VI_LOCK(vp);
+	if (vp->v_writecount > 0) {
+		error = ETXTBSY;
+	} else {
+		vp->v_writecount--;
+		error = 0;
+	}
+	VI_UNLOCK(vp);
+	return (error);
 }
 
 static int
 vop_stdunset_text(struct vop_unset_text_args *ap)
 {
+	struct vnode *vp;
+	int error;
 
-	ap->a_vp->v_vflag &= ~VV_TEXT;
-	return (0);
+	vp = ap->a_vp;
+	VI_LOCK(vp);
+	if (vp->v_writecount < 0) {
+		vp->v_writecount++;
+		error = 0;
+	} else {
+		error = EINVAL;
+	}
+	VI_UNLOCK(vp);
+	return (error);
 }
 
 static int
-vop_stdget_writecount(struct vop_get_writecount_args *ap)
-{
-
-	*ap->a_writecount = ap->a_vp->v_writecount;
-	return (0);
-}
-
-static int
 vop_stdadd_writecount(struct vop_add_writecount_args *ap)
 {
+	struct vnode *vp;
+	int error;
 
-	ap->a_vp->v_writecount += ap->a_inc;
-	return (0);
+	vp = ap->a_vp;
+	VI_LOCK(vp);
+	if (vp->v_writecount < 0) {
+		error = ETXTBSY;
+	} else {
+		VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp,
+		    ("neg writecount increment %d", ap->a_inc));
+		vp->v_writecount += ap->a_inc;
+		error = 0;
+	}
+	VI_UNLOCK(vp);
+	return (error);
 }
 
 /*

Modified: head/sys/kern/vfs_subr.c
==============================================================================
--- head/sys/kern/vfs_subr.c	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/kern/vfs_subr.c	Sun May  5 11:20:43 2019	(r347151)
@@ -3491,8 +3491,6 @@ vn_printf(struct vnode *vp, const char *fmt, ...)
 		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
 	if (vp->v_vflag & VV_CACHEDLABEL)
 		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
-	if (vp->v_vflag & VV_TEXT)
-		strlcat(buf, "|VV_TEXT", sizeof(buf));
 	if (vp->v_vflag & VV_COPYONWRITE)
 		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
 	if (vp->v_vflag & VV_SYSTEM)
@@ -3508,7 +3506,7 @@ vn_printf(struct vnode *vp, const char *fmt, ...)
 	if (vp->v_vflag & VV_FORCEINSMQ)
 		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
 	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
-	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
+	    VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
 	    VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);

Modified: head/sys/kern/vfs_vnops.c
==============================================================================
--- head/sys/kern/vfs_vnops.c	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/kern/vfs_vnops.c	Sun May  5 11:20:43 2019	(r347151)
@@ -294,6 +294,39 @@ bad:
 	return (error);
 }
 
+static int
+vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp)
+{
+	struct flock lf;
+	int error, lock_flags, type;
+
+	ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
+	if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0)
+		return (0);
+	KASSERT(fp != NULL, ("open with flock requires fp"));
+	if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
+		return (EOPNOTSUPP);
+
+	lock_flags = VOP_ISLOCKED(vp);
+	VOP_UNLOCK(vp, 0);
+
+	lf.l_whence = SEEK_SET;
+	lf.l_start = 0;
+	lf.l_len = 0;
+	lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
+	type = F_FLOCK;
+	if ((fmode & FNONBLOCK) == 0)
+		type |= F_WAIT;
+	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
+	if (error == 0)
+		fp->f_flag |= FHASLOCK;
+
+	vn_lock(vp, lock_flags | LK_RETRY);
+	if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0)
+		error = ENOENT;
+	return (error);
+}
+
 /*
  * Common code for vnode open operations once a vnode is located.
  * Check permissions, and call the VOP_OPEN routine.
@@ -303,8 +336,7 @@ vn_open_vnode(struct vnode *vp, int fmode, struct ucre
     struct thread *td, struct file *fp)
 {
 	accmode_t accmode;
-	struct flock lf;
-	int error, lock_flags, type;
+	int error;
 
 	if (vp->v_type == VLNK)
 		return (EMLINK);
@@ -335,63 +367,31 @@ vn_open_vnode(struct vnode *vp, int fmode, struct ucre
 
 	accmode &= ~(VCREAT | VVERIFY);
 #endif
-	if ((fmode & O_CREAT) == 0) {
-		if (accmode & VWRITE) {
-			error = vn_writechk(vp);
-			if (error)
-				return (error);
-		}
-		if (accmode) {
-		        error = VOP_ACCESS(vp, accmode, cred, td);
-			if (error)
-				return (error);
-		}
+	if ((fmode & O_CREAT) == 0 && accmode != 0) {
+		error = VOP_ACCESS(vp, accmode, cred, td);
+		if (error != 0)
+			return (error);
 	}
 	if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 		vn_lock(vp, LK_UPGRADE | LK_RETRY);
-	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
+	error = VOP_OPEN(vp, fmode, cred, td, fp);
+	if (error != 0)
 		return (error);
 
-	while ((fmode & (O_EXLOCK | O_SHLOCK)) != 0) {
-		KASSERT(fp != NULL, ("open with flock requires fp"));
-		if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE) {
-			error = EOPNOTSUPP;
-			break;
+	error = vn_open_vnode_advlock(vp, fmode, fp);
+	if (error == 0 && (fmode & FWRITE) != 0) {
+		error = VOP_ADD_WRITECOUNT(vp, 1);
+		if (error == 0) {
+			CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
+			     __func__, vp, vp->v_writecount);
 		}
-		lock_flags = VOP_ISLOCKED(vp);
-		VOP_UNLOCK(vp, 0);
-		lf.l_whence = SEEK_SET;
-		lf.l_start = 0;
-		lf.l_len = 0;
-		if (fmode & O_EXLOCK)
-			lf.l_type = F_WRLCK;
-		else
-			lf.l_type = F_RDLCK;
-		type = F_FLOCK;
-		if ((fmode & FNONBLOCK) == 0)
-			type |= F_WAIT;
-		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
-		if (error == 0)
-			fp->f_flag |= FHASLOCK;
-		vn_lock(vp, lock_flags | LK_RETRY);
-		if (error != 0)
-			break;
-		if ((vp->v_iflag & VI_DOOMED) != 0) {
-			error = ENOENT;
-			break;
-		}
-
-		/*
-		 * Another thread might have used this vnode as an
-		 * executable while the vnode lock was dropped.
-		 * Ensure the vnode is still able to be opened for
-		 * writing after the lock has been obtained.
-		 */
-		if ((accmode & VWRITE) != 0)
-			error = vn_writechk(vp);
-		break;
 	}
 
+	/*
+	 * Error from advlock or VOP_ADD_WRITECOUNT() still requires
+	 * calling VOP_CLOSE() to pair with earlier VOP_OPEN().
+	 * Arrange for that by having fdrop() to use vn_closefile().
+	 */
 	if (error != 0) {
 		fp->f_flag |= FOPENFAILED;
 		fp->f_vnode = vp;
@@ -400,18 +400,17 @@ vn_open_vnode(struct vnode *vp, int fmode, struct ucre
 			fp->f_ops = &vnops;
 		}
 		vref(vp);
-	} else if  ((fmode & FWRITE) != 0) {
-		VOP_ADD_WRITECOUNT(vp, 1);
-		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
-		    __func__, vp, vp->v_writecount);
 	}
+
 	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
 	return (error);
+
 }
 
 /*
  * Check for write permissions on the specified vnode.
  * Prototype text segments cannot be written.
+ * It is racy.
  */
 int
 vn_writechk(struct vnode *vp)
@@ -449,9 +448,7 @@ vn_close1(struct vnode *vp, int flags, struct ucred *f
 	vn_lock(vp, lock_flags | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
-		VNASSERT(vp->v_writecount > 0, vp, 
-		    ("vn_close: negative writecount"));
-		VOP_ADD_WRITECOUNT(vp, -1);
+		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 		    __func__, vp, vp->v_writecount);
 	}
@@ -1319,13 +1316,14 @@ vn_truncate(struct file *fp, off_t length, struct ucre
 	if (error)
 		goto out;
 #endif
-	error = vn_writechk(vp);
+	error = VOP_ADD_WRITECOUNT(vp, 1);
 	if (error == 0) {
 		VATTR_NULL(&vattr);
 		vattr.va_size = length;
 		if ((fp->f_flag & O_FSYNC) != 0)
 			vattr.va_vaflags |= VA_SYNC;
 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
+		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 	}
 out:
 	VOP_UNLOCK(vp, 0);

Modified: head/sys/kern/vnode_if.src
==============================================================================
--- head/sys/kern/vnode_if.src	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/kern/vnode_if.src	Sun May  5 11:20:43 2019	(r347151)
@@ -688,29 +688,21 @@ vop_is_text {
 };
 
 
-%% set_text	vp	E E E
+%% set_text	vp	= = =
 
 vop_set_text {
 	IN struct vnode *vp;
 };
 
 
-%% vop_unset_text	vp	E E E
+%% vop_unset_text	vp	= = =
 
 vop_unset_text {
 	IN struct vnode *vp;
 };
 
 
-%% get_writecount	vp	L L L
-
-vop_get_writecount {
-	IN struct vnode *vp;
-	OUT int *writecount;
-};
-
-
-%% add_writecount	vp	E E E
+%% add_writecount	vp	L L L
 
 vop_add_writecount {
 	IN struct vnode *vp;

Modified: head/sys/sys/imgact.h
==============================================================================
--- head/sys/sys/imgact.h	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/sys/imgact.h	Sun May  5 11:20:43 2019	(r347151)
@@ -89,6 +89,7 @@ struct image_params {
 	u_long stack_sz;
 	struct ucred *newcred;		/* new credentials if changing */
 	bool credential_setid;		/* true if becoming setid */
+	bool textset;
 	u_int map_flags;
 };
 

Modified: head/sys/sys/vnode.h
==============================================================================
--- head/sys/sys/vnode.h	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/sys/vnode.h	Sun May  5 11:20:43 2019	(r347151)
@@ -169,7 +169,8 @@ struct vnode {
 	u_int	v_iflag;			/* i vnode flags (see below) */
 	u_int	v_vflag;			/* v vnode flags */
 	u_int	v_mflag;			/* l mnt-specific vnode flags */
-	int	v_writecount;			/* v ref count of writers */
+	int	v_writecount;			/* I ref count of writers or
+						   (negative) text users */
 	u_int	v_hash;
 	enum	vtype v_type;			/* u vnode type */
 };
@@ -244,7 +245,6 @@ struct xvnode {
 #define	VV_NOSYNC	0x0004	/* unlinked, stop syncing */
 #define	VV_ETERNALDEV	0x0008	/* device that is never destroyed */
 #define	VV_CACHEDLABEL	0x0010	/* Vnode has valid cached MAC label */
-#define	VV_TEXT		0x0020	/* vnode is a pure text prototype */
 #define	VV_COPYONWRITE	0x0040	/* vnode is doing copy-on-write */
 #define	VV_SYSTEM	0x0080	/* vnode being used by kernel */
 #define	VV_PROCDEP	0x0100	/* vnode is process dependent */
@@ -749,6 +749,7 @@ int	vop_stdadvlock(struct vop_advlock_args *ap);
 int	vop_stdadvlockasync(struct vop_advlockasync_args *ap);
 int	vop_stdadvlockpurge(struct vop_advlockpurge_args *ap);
 int	vop_stdallocate(struct vop_allocate_args *ap);
+int	vop_stdset_text(struct vop_set_text_args *ap);
 int	vop_stdpathconf(struct vop_pathconf_args *);
 int	vop_stdpoll(struct vop_poll_args *);
 int	vop_stdvptocnp(struct vop_vptocnp_args *ap);
@@ -828,6 +829,33 @@ void	vop_rename_fail(struct vop_rename_args *ap);
 
 #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__)
 
+#ifdef	INVARIANTS
+#define	VOP_ADD_WRITECOUNT_CHECKED(vp, cnt)				\
+do {									\
+	int error_;							\
+									\
+	error_ = VOP_ADD_WRITECOUNT((vp), (cnt));			\
+	MPASS(error_ == 0);						\
+} while (0)
+#define	VOP_SET_TEXT_CHECKED(vp)					\
+do {									\
+	int error_;							\
+									\
+	error_ = VOP_SET_TEXT((vp));					\
+	MPASS(error_ == 0);						\
+} while (0)
+#define	VOP_UNSET_TEXT_CHECKED(vp)					\
+do {									\
+	int error_;							\
+									\
+	error_ = VOP_UNSET_TEXT((vp));					\
+	MPASS(error_ == 0);						\
+} while (0)
+#else
+#define	VOP_ADD_WRITECOUNT_CHECKED(vp, cnt)	VOP_ADD_WRITECOUNT((vp), (cnt))
+#define	VOP_SET_TEXT_CHECKED(vp)		VOP_SET_TEXT((vp))
+#define	VOP_UNSET_TEXT_CHECKED(vp)		VOP_UNSET_TEXT((vp))
+#endif
 
 void	vput(struct vnode *vp);
 void	vrele(struct vnode *vp);

Modified: head/sys/ufs/ufs/ufs_extattr.c
==============================================================================
--- head/sys/ufs/ufs/ufs_extattr.c	Sun May  5 11:06:19 2019	(r347150)
+++ head/sys/ufs/ufs/ufs_extattr.c	Sun May  5 11:20:43 2019	(r347151)
@@ -338,7 +338,12 @@ ufs_extattr_enable_with_open(struct ufsmount *ump, str
 		return (error);
 	}
 
-	VOP_ADD_WRITECOUNT(vp, 1);
+	error = VOP_ADD_WRITECOUNT(vp, 1);
+	if (error != 0) {
+		VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-all mailing list