svn commit: r267760 - head/sys/kern

Tue Jul 7 09:20:35 UTC 2015

On Tue, Jul 07, 2015 at 12:42:01AM +0200, Mateusz Guzik wrote:
> On Mon, Jul 06, 2015 at 12:09:58PM +0300, Konstantin Belousov wrote:
> > On Mon, Jul 06, 2015 at 08:46:02AM +0200, Mateusz Guzik wrote:
> > > diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c
> > > index fe8e9ef..c7f579a 100644
> > > --- a/sys/fs/devfs/devfs_vnops.c
> > > +++ b/sys/fs/devfs/devfs_vnops.c
> > > @@ -574,6 +574,8 @@ devfs_close(struct vop_close_args *ap)
> > >  	if (vp->v_data == NULL)
> > >  		return (0);
> > >  
> > > +	vp_locked = VOP_ISLOCKED(vp);
> > > +
> > >  	/*
> > >  	 * Hack: a tty device that is a controlling terminal
> > >  	 * has a reference from the session structure.
> > > @@ -589,6 +591,7 @@ devfs_close(struct vop_close_args *ap)
> > >  		if (vp == p->p_session->s_ttyvp) {
> > >  			PROC_UNLOCK(p);
> > >  			oldvp = NULL;
> > > +			VOP_UNLOCK(vp, 0);
> > This opens a window where vp can be reclaimed. Then vp->v_rdev
> > is invalid and dev_refthread() below accesses random memory.
> > 
> > Might be, the easiest fix is to call dev_refthread() before the td !=
> > NULL block, but leave dsw == NULL check where it is now.
> > 
> 
> Instead I made the bare minimum to ensure that modified fields which are
> accessed here are always protected with proc lock and sess lock, which
> eliminates the need for proctree lock in problematic cases. In effect
> there are no games with relocking vnodes.
This could work.  What I want to know is what methodology was used to
ensure that added locking for other places is enough.

The proctree locking is 'natural', since session/group membership relates
to the process tree structure and owning the sx guarantees that the tree
is stable.  Your locking can work, but needs an argumentation why it is
complete.  Also, you should add some note to proc.h to indicate in the
locking annotations that reading of e.g. pg_session is safe when only
process lock is owned.

> 
> I would commit this patch separately.
> 
> Note this establishes a process lock -> vnode interlock order which
> seems fishy, but apparently does not result in lors.
> 
> diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c
> index fe8e9ef..0a2a6bb 100644
> --- a/sys/fs/devfs/devfs_vnops.c
> +++ b/sys/fs/devfs/devfs_vnops.c
> @@ -586,12 +586,12 @@ devfs_close(struct vop_close_args *ap)
>  	if (td != NULL) {
>  		p = td->td_proc;
>  		PROC_LOCK(p);
> -		if (vp == p->p_session->s_ttyvp) {
> +		if (vp != p->p_session->s_ttyvp) {
>  			PROC_UNLOCK(p);
> +		} else {
>  			oldvp = NULL;
> -			sx_xlock(&proctree_lock);
> +			SESS_LOCK(p->p_session);
>  			if (vp == p->p_session->s_ttyvp) {
> -				SESS_LOCK(p->p_session);
>  				VI_LOCK(vp);
>  				if (count_dev(dev) == 2 &&
>  				    (vp->v_iflag & VI_DOOMED) == 0) {
> @@ -600,13 +600,12 @@ devfs_close(struct vop_close_args *ap)
>  					oldvp = vp;
>  				}
>  				VI_UNLOCK(vp);
> -				SESS_UNLOCK(p->p_session);
>  			}
> -			sx_xunlock(&proctree_lock);
> +			SESS_UNLOCK(p->p_session);
> +			PROC_UNLOCK(p);
>  			if (oldvp != NULL)
>  				vrele(oldvp);
> -		} else
> -			PROC_UNLOCK(p);
> +		}
>  	}
>  	/*
>  	 * We do not want to really close the device if it
> diff --git a/sys/kern/tty.c b/sys/kern/tty.c
> index d9d0cce..5ef5578 100644
> --- a/sys/kern/tty.c
> +++ b/sys/kern/tty.c
> @@ -1637,8 +1637,13 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag,
>  			return (EPERM);
>  		}
>  
> +		PROC_LOCK(p);
> +		SESS_LOCK(p->p_session);
> +
>  		if (tp->t_session != NULL && tp->t_session == p->p_session) {
>  			/* This is already our controlling TTY. */
> +			SESS_UNLOCK(p->p_session);
> +			PROC_UNLOCK(p);
>  			sx_xunlock(&proctree_lock);
>  			return (0);
>  		}
> @@ -1657,6 +1662,8 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag,
>  			 * TTYs of which the session leader has been
>  			 * killed or the TTY revoked.
>  			 */
> +			SESS_UNLOCK(p->p_session);
> +			PROC_UNLOCK(p);
>  			sx_xunlock(&proctree_lock);
>  			return (EPERM);
>  		}
> @@ -1665,14 +1672,13 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag,
>  		tp->t_session = p->p_session;
>  		tp->t_session->s_ttyp = tp;
>  		tp->t_sessioncnt++;
> -		sx_xunlock(&proctree_lock);
>  
>  		/* Assign foreground process group. */
>  		tp->t_pgrp = p->p_pgrp;
> -		PROC_LOCK(p);
>  		p->p_flag |= P_CONTROLT;
> +		SESS_UNLOCK(p->p_session);
>  		PROC_UNLOCK(p);
> -
> +		sx_xunlock(&proctree_lock);
>  		return (0);
>  	}
>  	case TIOCSPGRP: {
> diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c
> index 582b41a..dca619c 100644
> --- a/sys/kern/tty_tty.c
> +++ b/sys/kern/tty_tty.c
> @@ -65,9 +65,8 @@ ctty_clone(void *arg, struct ucred *cred, char *name, int namelen,
>  	if (strcmp(name, "tty"))
>  		return;
>  	p = curproc;
> -	sx_sunlock(&clone_drain_lock);
> -	sx_slock(&proctree_lock);
> -	sx_slock(&clone_drain_lock);
> +	PROC_LOCK(p);
> +	SESS_LOCK(p->p_session);
>  	dev_lock();
>  	if (!(p->p_flag & P_CONTROLT))
>  		*dev = ctty;
> @@ -81,7 +80,8 @@ ctty_clone(void *arg, struct ucred *cred, char *name, int namelen,
>  		*dev = p->p_session->s_ttyvp->v_rdev;
>  	dev_refl(*dev);
>  	dev_unlock();
> -	sx_sunlock(&proctree_lock);
> +	SESS_UNLOCK(p->p_session);
> +	PROC_UNLOCK(p);
>  }
>  
>  static void
> > > diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
> > > index 85cda01..bbdcbf5 100644
> > > --- a/sys/kern/kern_descrip.c
> > > +++ b/sys/kern/kern_descrip.c
> > > @@ -3271,6 +3271,7 @@ kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
> > >  	FILEDESC_SUNLOCK(fdp);
> > >  	fddrop(fdp);
> > >  fail:
> > > +	sx_sunlock(&p->p_imagelock);
> > IMO it would be less confusing to unlock the p_imagelock at the caller'
> > location. At the kern_proc_filedesc_out(), the p_imagelock slocked state
> > must be asserted.
> > 
> 
> Moved.
> 
> > Same for ofiledesc().
> > 
> 
> This is the top level func which also has the relevant pget call.
> 
> > > diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
> > > index 9ce6d34..27fd764 100644
> > > --- a/sys/kern/kern_exit.c
> > > +++ b/sys/kern/kern_exit.c
> > > @@ -213,6 +213,7 @@ exit1(struct thread *td, int rv)
> > >  	/*
> > >  	 * MUST abort all other threads before proceeding past here.
> > >  	 */
> > > +	sx_xlock(&p->p_imagelock);
> > >  	PROC_LOCK(p);
> > >  	/*
> > >  	 * First check if some other thread or external request got
> > > @@ -279,6 +280,7 @@ exit1(struct thread *td, int rv)
> > >  	 * decided to wait again after we told them we are exiting.
> > >  	 */
> > >  	p->p_flag |= P_WEXIT;
> > > +	sx_xunlock(&p->p_imagelock);
> > I do not understand why p_imagelock is released so early in the exit1().
> > 
> 
> Since the process catched with p_imagelock must not be exiting, a
> lock/unlock sequence paired with setting P_WEXIT gives us a guarantee
> that nothing will start inspeting it.
> 
> Past P_WEXIT there are next to no guarantees for stability of most
> fields of the proc anyway, and soon after proctree lock is taken and we
> have to drop imagelock by that time to avoid a lor.
So there is a guarantee that after P_WEXIT is set, pget() in the kern.proc.XXX
would always fail due to P_NOTWEXIT flag being always present ?

> 
> > >  	wakeup(&p->p_stype);
> > >  
> > >  	/*
> > > diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
> > > index 27c6f40..ac6b441 100644
> > > --- a/sys/kern/kern_proc.c
> > > +++ b/sys/kern/kern_proc.c
> > > @@ -230,6 +230,7 @@ proc_init(void *mem, int size, int flags)
> > >  	mtx_init(&p->p_statmtx, "pstatl", NULL, MTX_SPIN | MTX_NEW);
> > >  	mtx_init(&p->p_itimmtx, "pitiml", NULL, MTX_SPIN | MTX_NEW);
> > >  	mtx_init(&p->p_profmtx, "pprofl", NULL, MTX_SPIN | MTX_NEW);
> > > +	sx_init_flags(&p->p_imagelock, "process imagelock", SX_NEW);
> > >  	cv_init(&p->p_pwait, "ppwait");
> > >  	cv_init(&p->p_dbgwait, "dbgwait");
> > >  	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
> > > @@ -278,16 +279,20 @@ inferior(struct proc *p)
> > >  }
> > >  
> > >  struct proc *
> > > -pfind_locked(pid_t pid)
> > > +pfind_locked(pid_t pid, int lockimage)
> > >  {
> > >  	struct proc *p;
> > >  
> > >  	sx_assert(&allproc_lock, SX_LOCKED);
> > >  	LIST_FOREACH(p, PIDHASH(pid), p_hash) {
> > >  		if (p->p_pid == pid) {
> > > +			if (lockimage)
> > > +				sx_slock(&p->p_imagelock);
> > Probably makes sense to explicitely add imagelock name into the
> > witness' order_lists[] array.
> > 
> 
> Added after allprison.
> 
> As a cosmetic matter I'm pondering a pair like pimage_lock/unlock or
> similar to wrap relevant pget and unlock of p_imagelock.
> 
> Note that in this patch I moved exit's imagelock to after
> singlethreading is in effect.
> 
> diff --git a/sys/fs/nfsclient/nfs_clport.c b/sys/fs/nfsclient/nfs_clport.c
> index d3bac30..118a7d9 100644
> --- a/sys/fs/nfsclient/nfs_clport.c
> +++ b/sys/fs/nfsclient/nfs_clport.c
> @@ -1188,7 +1188,7 @@ nfscl_procdoesntexist(u_int8_t *own)
>  	tl.cval[2] = *own++;
>  	tl.cval[3] = *own++;
>  	pid = tl.lval;
> -	p = pfind_locked(pid);
> +	p = pfind_locked(pid, 0);
>  	if (p == NULL)
>  		return (1);
>  	if (p->p_stats == NULL) {
> diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
> index 0675128..2ad09aa 100644
> --- a/sys/kern/imgact_elf.c
> +++ b/sys/kern/imgact_elf.c
> @@ -1887,6 +1887,7 @@ note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep)
>  		sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
>  		sbuf_set_drain(sb, sbuf_drain_count, &size);
>  		sbuf_bcat(sb, &structsize, sizeof(structsize));
> +		sx_slock(&p->p_imagelock);
>  		PROC_LOCK(p);
>  		kern_proc_filedesc_out(p, sb, -1);
>  		sbuf_finish(sb);
> @@ -1895,6 +1896,7 @@ note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep)
>  	} else {
>  		structsize = sizeof(struct kinfo_file);
>  		sbuf_bcat(sb, &structsize, sizeof(structsize));
> +		sx_slock(&p->p_imagelock);
>  		PROC_LOCK(p);
>  		kern_proc_filedesc_out(p, sb, -1);
>  	}
> diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
> index 37539c4..9364dcb 100644
> --- a/sys/kern/init_main.c
> +++ b/sys/kern/init_main.c
> @@ -480,6 +480,7 @@ proc0_init(void *dummy __unused)
>  	p->p_flag2 = 0;
>  	p->p_state = PRS_NORMAL;
>  	knlist_init_mtx(&p->p_klist, &p->p_mtx);
> +	sx_init_flags(&p->p_imagelock, "process imagelock", SX_NEW);
>  	STAILQ_INIT(&p->p_ktr);
>  	p->p_nice = NZERO;
>  	/* pid_max cannot be greater than PID_MAX */
> diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
> index 85cda01..f0f4665 100644
> --- a/sys/kern/kern_descrip.c
> +++ b/sys/kern/kern_descrip.c
> @@ -3292,13 +3292,14 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
>  
>  	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
>  	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
> -	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
> +	error = pget((pid_t)name[0], PGET_LOCK, &p);
>  	if (error != 0) {
>  		sbuf_delete(&sb);
>  		return (error);
>  	}
>  	maxlen = req->oldptr != NULL ? req->oldlen : -1;
>  	error = kern_proc_filedesc_out(p, &sb, maxlen);
> +	sx_sunlock(&p->p_imagelock);
>  	error2 = sbuf_finish(&sb);
>  	sbuf_delete(&sb);
>  	return (error != 0 ? error : error2);
> @@ -3359,7 +3360,7 @@ sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
>  	struct proc *p;
>  
>  	name = (int *)arg1;
> -	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
> +	error = pget((pid_t)name[0], PGET_LOCK, &p);
>  	if (error != 0)
>  		return (error);
>  	fdp = fdhold(p);
> @@ -3391,6 +3392,7 @@ sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
>  	}
>  	FILEDESC_SUNLOCK(fdp);
>  	fddrop(fdp);
> +	sx_sunlock(&p->p_imagelock);
>  	free(kif, M_TEMP);
>  	free(okif, M_TEMP);
>  	return (0);
> diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
> index 859b2e3..0d9aac1 100644
> --- a/sys/kern/kern_exec.c
> +++ b/sys/kern/kern_exec.c
> @@ -381,6 +381,7 @@ do_execve(td, args, mac_p)
>  	 * that might allow a local user to illicitly obtain elevated
>  	 * privileges.
>  	 */
> +	sx_xlock(&p->p_imagelock);
>  	PROC_LOCK(p);
>  	KASSERT((p->p_flag & P_INEXEC) == 0,
>  	    ("%s(): process already has P_INEXEC flag", __func__));
> @@ -907,6 +908,7 @@ exec_fail:
>  	SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0);
>  
>  done2:
> +	sx_xunlock(&p->p_imagelock);
>  #ifdef MAC
>  	mac_execve_exit(imgp);
>  	mac_execve_interpreter_exit(interpvplabel);
> diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
> index 9ce6d34..127629d 100644
> --- a/sys/kern/kern_exit.c
> +++ b/sys/kern/kern_exit.c
> @@ -290,6 +290,15 @@ exit1(struct thread *td, int rv)
>  
>  	p->p_xstat = rv;	/* Let event handler change exit status */
>  	PROC_UNLOCK(p);
> +
> +	/*
> +	 * The lock/unlock pair gives us a guarantee nobody pins the process
> +	 * with p_imagelock. While the process is still reachable, consumers
> +	 * are supposed to check for P_WEXIT flag set earlier.
> +	 */
> +	sx_xlock(&p->p_imagelock);
> +	sx_xunlock(&p->p_imagelock);
> +
>  	/* Drain the limit callout while we don't have the proc locked */
>  	callout_drain(&p->p_limco);
>  
> diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
> index 27c6f40..ac6b441 100644
> --- a/sys/kern/kern_proc.c
> +++ b/sys/kern/kern_proc.c
> @@ -230,6 +230,7 @@ proc_init(void *mem, int size, int flags)
>  	mtx_init(&p->p_statmtx, "pstatl", NULL, MTX_SPIN | MTX_NEW);
>  	mtx_init(&p->p_itimmtx, "pitiml", NULL, MTX_SPIN | MTX_NEW);
>  	mtx_init(&p->p_profmtx, "pprofl", NULL, MTX_SPIN | MTX_NEW);
> +	sx_init_flags(&p->p_imagelock, "process imagelock", SX_NEW);
>  	cv_init(&p->p_pwait, "ppwait");
>  	cv_init(&p->p_dbgwait, "dbgwait");
>  	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
> @@ -278,16 +279,20 @@ inferior(struct proc *p)
>  }
>  
>  struct proc *
> -pfind_locked(pid_t pid)
> +pfind_locked(pid_t pid, int lockimage)
>  {
>  	struct proc *p;
>  
>  	sx_assert(&allproc_lock, SX_LOCKED);
>  	LIST_FOREACH(p, PIDHASH(pid), p_hash) {
>  		if (p->p_pid == pid) {
> +			if (lockimage)
> +				sx_slock(&p->p_imagelock);
>  			PROC_LOCK(p);
>  			if (p->p_state == PRS_NEW) {
>  				PROC_UNLOCK(p);
> +				if (lockimage)
> +					sx_sunlock(&p->p_imagelock);
>  				p = NULL;
>  			}
>  			break;
> @@ -308,7 +313,7 @@ pfind(pid_t pid)
>  	struct proc *p;
>  
>  	sx_slock(&allproc_lock);
> -	p = pfind_locked(pid);
> +	p = pfind_locked(pid, 0);
>  	sx_sunlock(&allproc_lock);
>  	return (p);
>  }
> @@ -364,11 +369,17 @@ int
>  pget(pid_t pid, int flags, struct proc **pp)
>  {
>  	struct proc *p;
> -	int error;
> +	int error, lockimage;
> +
> +	lockimage = ((flags & PGET_IMAGELOCK) != 0);
> +	if (lockimage) {
> +		MPASS((flags & PGET_NOTWEXIT) != 0);
> +		MPASS((flags & PGET_HOLD) == 0);
> +	}
>  
>  	sx_slock(&allproc_lock);
>  	if (pid <= PID_MAX) {
> -		p = pfind_locked(pid);
> +		p = pfind_locked(pid, lockimage);
>  		if (p == NULL && (flags & PGET_NOTWEXIT) == 0)
>  			p = zpfind_locked(pid);
>  	} else if ((flags & PGET_NOTID) == 0) {
> @@ -413,6 +424,8 @@ pget(pid_t pid, int flags, struct proc **pp)
>  	return (0);
>  errout:
>  	PROC_UNLOCK(p);
> +	if (lockimage)
> +		sx_sunlock(&p->p_imagelock);
>  	return (error);
>  }
>  
> diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c
> index 1280807..a423c80 100644
> --- a/sys/kern/subr_witness.c
> +++ b/sys/kern/subr_witness.c
> @@ -475,6 +475,7 @@ static struct witness_order_list_entry order_lists[] = {
>  	{ "proctree", &lock_class_sx },
>  	{ "allproc", &lock_class_sx },
>  	{ "allprison", &lock_class_sx },
> +	{ "process imagelock", &lock_class_sx },
>  	{ NULL, NULL },
>  	/*
>  	 * Various mutexes
> diff --git a/sys/sys/proc.h b/sys/sys/proc.h
> index e6c83b4..5df817a 100644
> --- a/sys/sys/proc.h
> +++ b/sys/sys/proc.h
> @@ -522,6 +522,7 @@ struct proc {
>  					       (if I am reaper). */
>  	LIST_ENTRY(proc) p_reapsibling;	/* (e) List of siblings - descendants of
>  					       the same reaper. */
> +	struct sx	p_imagelock;	/* Lock blocking exec/exit. */
>  	struct mtx	p_mtx;		/* (n) Lock for this struct. */
>  	struct mtx	p_statmtx;	/* Lock for the stats */
>  	struct mtx	p_itimmtx;	/* Lock for the virt/prof timers */
> @@ -886,7 +887,7 @@ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */
>  extern struct uma_zone *proc_zone;
>  
>  struct	proc *pfind(pid_t);		/* Find process by id. */
> -struct	proc *pfind_locked(pid_t pid);
> +struct	proc *pfind_locked(pid_t pid, int lockimage);
>  struct	pgrp *pgfind(pid_t);		/* Find process group by id. */
>  struct	proc *zpfind(pid_t);		/* Find zombie process by id. */
>  
> @@ -900,8 +901,10 @@ struct	proc *zpfind(pid_t);		/* Find zombie process by id. */
>  #define	PGET_NOTWEXIT	0x00010	/* Check that the process is not in P_WEXIT. */
>  #define	PGET_NOTINEXEC	0x00020	/* Check that the process is not in P_INEXEC. */
>  #define	PGET_NOTID	0x00040	/* Do not assume tid if pid > PID_MAX. */
> +#define	PGET_IMAGELOCK	0x00080 /* Prevent execs and exits. */
>  
>  #define	PGET_WANTREAD	(PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT)
> +#define	PGET_LOCK	(PGET_CANDEBUG | PGET_NOTWEXIT | PGET_IMAGELOCK)
>  
>  int	pget(pid_t pid, int flags, struct proc **pp);
>  
> -- 
> Mateusz Guzik <mjguzik gmail.com>