Extending MADV_PROTECT

John Baldwin jhb at freebsd.org
Tue May 7 18:33:42 UTC 2013


One of the issues I have with our current MADV_PROTECT is that it isn't very 
administrator-friendly.  That is, as a sysadmin I can't easily protect 
arbitrary processes from the OOM killer.  Instead, the binary has to be 
changed to invoke madvise().  Furthermore, once the protection is granted it 
can't be revoked.  Also, any binaries that want this have to be run as root.  
Instead, I would like to be able to both set and revoke this for existing 
processes and possibly even allow it to be inherited (so I can tag a top-level 
daemon that forks and have all its future children be protected for example).  
To that end I've whipped up a simple patch (against 8, but should port to HEAD 
easily if folks think it is a good idea) to add a new pprotect() system call 
and userland program (protect) that can be used similarly to ktrace(1) either as 
a modifier when running a new program or as a tool for setting or clearing 
protection for existing processes.

The inherit feature isn't implemented yet, but it should be simple to do.  One 
would simply need a new process flag, set by PPROT_INHERIT, that is checked on 
fork and propagates P_PROTECTED to the child if it is set.  Also, one other thought I had is 
that at some point we might want to make P_PROTECTED more fine-grained, e.g. 
by allowing for OOM "priorities".  To that end, it may make sense to add a new 
argument to protect, though you could also reserve part of the 'op' parameter 
to encode a priority.

The manpage for the proposed protect command is below, then the source of the 
command, then the patch to add pprotect():

PROTECT(1)              FreeBSD General Commands Manual             PROTECT(1)

NAME
     protect -- protect processes from being killed when swap space is
     exhausted

SYNOPSIS
     protect [-i] command
     protect [-cdi] -g pgrp | -p pid

DESCRIPTION
     The protect command is used to mark processes as protected.  The kernel
     does not kill protected processes when swap space is exhausted.  Note
     that this protected state is not inherited by child processes.

     The options are:

     -c      Remove protection from the specified processes.

     -d      Apply the operation to all current children of the specified pro-
             cesses.

     -i      Apply the operation to all future children of the specified pro-
             cesses.

     -g pgrp
             Apply the operation to all processes in the specified process
             group.

     -p pid  Apply the operation to the specified process.

     command
             Execute command as a protected process.

     Note that only one of the -p or -g flags may be specified when adjusting
     the state of existing processes.

EXIT STATUS
     The protect utility exits 0 on success, and >0 if an error occurs.

EXAMPLES
     Mark the Xorg server as protected:
           pgrep Xorg | xargs protect -p
     Protect all ssh sessions and their child processes:
           pgrep sshd | xargs protect -dip
     Remove protection from all current and future processes:
           protect -cdi -p 1

SEE ALSO
     pprotect(2)

BUGS
     If you protect a runaway process that allocates all memory the system
     will deadlock.

     Inheritance of the protected state is not yet implemented.

FreeBSD 8.2                       May 7, 2013                      FreeBSD 8.2

#include <sys/cdefs.h>
__FBSDID("$FreeBSD");

#include <sys/types.h>
#include <sys/mman.h>
#include <err.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Write the two-line command synopsis to standard error and terminate
 * the program with exit status 1.  This function does not return.
 */
static void
usage(void)
{

	fputs("usage: protect [-i] command\n"
	    "       protect [-cdi] -g pgrp | -p pid\n", stderr);
	exit(1);
}

/*
 * Parse the argument of a -g or -p option as a process or process group ID.
 *
 * The string is converted with strtol() using base 0, so decimal, octal
 * ("0" prefix) and hexadecimal ("0x" prefix) spellings are accepted.
 *
 * Only one -g or -p option is permitted per invocation; a static flag
 * records that an ID has already been parsed so that a second call is
 * rejected.
 *
 * Returns the parsed, non-negative ID.  On any error (repeated option,
 * empty string, trailing characters, negative value, or a value that does
 * not fit in pid_t) a diagnostic is printed with warnx() and usage() is
 * called, which exits.
 */
static pid_t
parse_pid(char *id)
{
	static bool first = true;
	long value;
	char *ch;

	if (!first) {
		warnx("only one -g or -p flag is permitted");
		usage();
	}
	/* Remember that an ID was supplied so a second one is rejected. */
	first = false;

	errno = 0;
	value = strtol(id, &ch, 0);
	/*
	 * Reject input with no digits, trailing junk, or overflow.  Negative
	 * values are refused as well: the caller negates the result for -g
	 * to select a process group, so a negative ID would silently swap
	 * the meaning of -g and -p.
	 */
	if (ch == id || *ch != '\0' || errno == ERANGE || value < 0 ||
	    (long)(pid_t)value != value) {
		warnx("invalid process id");
		usage();
	}
	return (value);
}

/*
 * protect: set or clear the protected state of processes via pprotect().
 *
 * Two modes are supported:
 *   protect [-i] command          - apply the operation to this process,
 *                                   then exec the command.
 *   protect [-cdi] -g pgrp|-p pid - apply the operation to an existing
 *                                   process or process group.
 *
 * Options:
 *   -c  use PPROT_CLEAR instead of the default PPROT_SET.
 *   -d  add PPROT_DESCEND to the operation.
 *   -i  add PPROT_INHERIT to the operation.
 *   -g  target a process group; the ID is passed to pprotect() negated.
 *   -p  target a single process.
 *
 * Exits 0 on success, 1 on a usage error or if pprotect() fails, and
 * 127 (command not found) or 126 (any other exec failure) if the command
 * cannot be executed.
 */
int
main(int argc, char *argv[])
{
	pid_t pid;
	int ch, op;
	bool descend, inherit, pidset;

	/* Without -g or -p the operation applies to this process. */
	pid = getpid();
	op = PPROT_SET;
	descend = inherit = pidset = false;
	while ((ch = getopt(argc, argv, "cdig:p:")) != -1)
		switch (ch) {
		case 'c':
			op = PPROT_CLEAR;
			break;
		case 'd':
			descend = true;
			break;
		case 'i':
			inherit = true;
			break;
		case 'g':
			/* A negative ID selects a process group. */
			pid = -parse_pid(optarg);
			pidset = true;
			break;
		case 'p':
			pid = parse_pid(optarg);
			pidset = true;
			break;
		case '?':
		default:
			/*
			 * Unknown option or missing option argument: stop
			 * here rather than carrying on with a partially
			 * parsed command line.
			 */
			usage();
		}
	argc -= optind;
	argv += optind;

	/*
	 * An explicit target excludes a command; without a target a command
	 * is required and -d is not allowed.
	 */
	if ((pidset && argc != 0) || (!pidset && (argc == 0 || descend)))
		usage();

	if (descend)
		op |= PPROT_DESCEND;
	if (inherit)
		op |= PPROT_INHERIT;
	if (pprotect(op, pid) == -1)
		err(1, "request failed");

	if (argc != 0) {
		errno = 0;
		execvp(*argv, argv);
		/* execvp() only returns on failure. */
		err(errno == ENOENT ? 127 : 126, "%s", *argv);
	}
	return (0);
}

Index: sys/compat/freebsd32/syscalls.master
===================================================================
--- sys/compat/freebsd32/syscalls.master	(revision 251038)
+++ sys/compat/freebsd32/syscalls.master	(working copy)
@@ -977,3 +977,15 @@
 				    uint32_t offset1, uint32_t offset2,\
 				    uint32_t len1, uint32_t len2, \
 				    int advice); }
+532	AUE_NULL	UNIMPL	wait6
+533	AUE_NULL	UNIMPL	cap_rights_limit
+534	AUE_NULL	UNIMPL	cap_ioctls_limit
+535	AUE_NULL	UNIMPL	cap_ioctls_get
+536	AUE_NULL	UNIMPL	cap_fcntls_limit
+537	AUE_NULL	UNIMPL	cap_fcntls_get
+538	AUE_NULL	UNIMPL	bindat
+539	AUE_NULL	UNIMPL	connectat
+540	AUE_NULL	UNIMPL	chflagsat
+541	AUE_NULL	UNIMPL	accept4
+542	AUE_NULL	UNIMPL	pipe2
+543	AUE_NULL	NOPROTO	{ int pprotect(int op, pid_t pid); }
Index: sys/kern/syscalls.master
===================================================================
--- sys/kern/syscalls.master	(revision 251038)
+++ sys/kern/syscalls.master	(working copy)
@@ -938,5 +938,17 @@
 				    off_t offset, off_t len); }
 531	AUE_NULL	STD	{ int posix_fadvise(int fd, off_t offset, \
 				    off_t len, int advice); }
+532	AUE_NULL	UNIMPL	wait6
+533	AUE_NULL	UNIMPL	cap_rights_limit
+534	AUE_NULL	UNIMPL	cap_ioctls_limit
+535	AUE_NULL	UNIMPL	cap_ioctls_get
+536	AUE_NULL	UNIMPL	cap_fcntls_limit
+537	AUE_NULL	UNIMPL	cap_fcntls_get
+538	AUE_NULL	UNIMPL	bindat
+539	AUE_NULL	UNIMPL	connectat
+540	AUE_NULL	UNIMPL	chflagsat
+541	AUE_NULL	UNIMPL	accept4
+542	AUE_NULL	UNIMPL	pipe2
+543	AUE_NULL	STD	{ int pprotect(int op, pid_t pid); }
 ; Please copy any additions and changes to the following compatability tables:
 ; sys/compat/freebsd32/syscalls.master
Index: sys/sys/mman.h
===================================================================
--- sys/sys/mman.h	(revision 251038)
+++ sys/sys/mman.h	(working copy)
@@ -143,6 +143,22 @@
 #define	MINCORE_SUPER		0x20 /* Page is a "super" page */
 
 /*
+ * Operations for pprotect().
+ */
+#define	PPROT_OP_MASK		(0xf)
+#define	PPROT_SET		0
+#define	PPROT_CLEAR		1
+#define	PPROT_OP(x)		((x) & PPROT_OP_MASK)	/* Base operation. */
+
+/*
+ * Flags for pprotect (ORed in with operation).
+ */
+#define	PPROT_FLAG_MASK		(~PPROT_OP_MASK)
+#define	PPROT_DESCEND		0x10
+#define	PPROT_INHERIT		0x20
+#define	PPROT_FLAGS(x)		((x) & PPROT_FLAG_MASK)
+
+/*
  * Anonymous object constant for shm_open().
  */
 #define	SHM_ANON		((char *)1)
@@ -222,6 +238,7 @@
 int	madvise(void *, size_t, int);
 int	mincore(const void *, size_t, char *);
 int	minherit(void *, size_t, int);
+int	pprotect(int, pid_t);
 #endif
 int	mlock(const void *, size_t);
 #ifndef _MMAP_DECLARED
Index: sys/sys/syscallsubr.h
===================================================================
--- sys/sys/syscallsubr.h	(revision 251038)
+++ sys/sys/syscallsubr.h	(working copy)
@@ -154,6 +154,7 @@
 	    int advice);
 int	kern_posix_fallocate(struct thread *td, int fd, off_t offset,
 	    off_t len);
+int	kern_pprotect(struct thread *td, int op, pid_t pid);
 int	kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset);
 int	kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou,
 	    fd_set *ex, struct timeval *tvp, sigset_t *uset, int abi_nfdbits);
Index: sys/vm/vm_mmap.c
===================================================================
--- sys/vm/vm_mmap.c	(revision 251038)
+++ sys/vm/vm_mmap.c	(working copy)
@@ -63,6 +63,7 @@
 #include <sys/mount.h>
 #include <sys/conf.h>
 #include <sys/stat.h>
+#include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/vmmeter.h>
 
@@ -668,23 +669,13 @@
 {
 	vm_offset_t start, end;
 	vm_map_t map;
-	struct proc *p;
-	int error;
 
 	/*
 	 * Check for our special case, advising the swap pager we are
 	 * "immortal."
 	 */
-	if (uap->behav == MADV_PROTECT) {
-		error = priv_check(td, PRIV_VM_MADV_PROTECT);
-		if (error == 0) {
-			p = td->td_proc;
-			PROC_LOCK(p);
-			p->p_flag |= P_PROTECTED;
-			PROC_UNLOCK(p);
-		}
-		return (error);
-	}
+	if (uap->behav == MADV_PROTECT)
+		return (kern_pprotect(td, PPROT_SET, td->td_proc->p_pid));
 	/*
 	 * Check for illegal behavior
 	 */
@@ -1102,6 +1093,154 @@
 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
 }
 
+#ifndef _SYS_SYSPROTO_H_
+struct pprotect_args {
+	int op;
+	pid_t pid;
+};
+#endif
+int
+pprotect(td, uap)
+	struct thread *td;
+	struct pprotect_args *uap;
+{
+
+	return (kern_pprotect(td, uap->op, uap->pid));
+}
+
+static int
+pprot_setchild(struct thread *td, struct proc *p, int op)
+{
+	PROC_LOCK(p);
+	if (p->p_flag & P_SYSTEM || p_cansee(td, p) != 0) {
+		PROC_UNLOCK(p);
+		return (0);
+	}
+
+	switch (PPROT_OP(op)) {
+	case PPROT_SET:
+		p->p_flag |= P_PROTECTED;
+		break;
+	case PPROT_CLEAR:
+		p->p_flag &= ~P_PROTECTED;
+		break;
+	default:
+		PROC_UNLOCK(p);
+		return (0);
+	}
+	PROC_UNLOCK(p);
+	return (1);
+}
+
+static int
+pprot_setchildren(struct thread *td, struct proc *top, int op)
+{
+	struct proc *p;
+	int ret;
+
+	p = top;
+	ret = 0;
+	sx_assert(&proctree_lock, SX_LOCKED);
+	for (;;) {
+		ret |= pprot_setchild(td, p, op);
+		/*
+		 * If this process has children, descend to them next,
+		 * otherwise do any siblings, and if done with this level,
+		 * follow back up the tree (but not past top).
+		 */
+		if (!LIST_EMPTY(&p->p_children))
+			p = LIST_FIRST(&p->p_children);
+		else for (;;) {
+			if (p == top)
+				return (ret);
+			if (LIST_NEXT(p, p_sibling)) {
+				p = LIST_NEXT(p, p_sibling);
+				break;
+			}
+			p = p->p_pptr;
+		}
+	}
+}
+
+int
+kern_pprotect(struct thread *td, int op, pid_t pid)
+{
+	struct pgrp *pg;
+	struct proc *p;
+	int error, nfound, ret;
+
+	switch (PPROT_OP(op)) {
+	case PPROT_SET:
+	case PPROT_CLEAR:
+		break;
+	default:
+		return (EINVAL);
+	}
+	if ((PPROT_FLAGS(op) & ~(PPROT_DESCEND | PPROT_INHERIT)) != 0)
+		return (EINVAL);
+	if (op & PPROT_INHERIT)
+		return (EOPNOTSUPP);
+	
+	error = priv_check(td, PRIV_VM_MADV_PROTECT);
+	if (error)
+		return (error);
+
+	ret = 0;
+	sx_slock(&proctree_lock);
+	if (pid < 0) {
+		/* By process group. */
+		pg = pgfind(-pid);
+		if (pg == NULL) {
+			sx_sunlock(&proctree_lock);
+			return (ESRCH);
+		}
+		PGRP_UNLOCK(pg);
+		nfound = 0;
+		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+			PROC_LOCK(p);
+			if (p->p_state == PRS_NEW ||
+			    p_cansee(td, p) != 0) {
+				PROC_UNLOCK(p);
+				continue;
+			}
+			PROC_UNLOCK(p);
+			nfound++;
+			if (op & PPROT_DESCEND)
+				ret |= pprot_setchildren(td, p, op);
+			else
+				ret |= pprot_setchild(td, p, op);
+		}
+		if (nfound == 0) {
+			sx_sunlock(&proctree_lock);
+			return (ESRCH);
+		}
+	} else {
+		/* By pid. */
+		p = pfind(pid);
+		if (p == NULL) {
+			sx_sunlock(&proctree_lock);
+			return (ESRCH);
+		}
+		if (p->p_state == PRS_NEW)
+			error = ESRCH;
+		else
+			error = p_cansee(td, p);
+		PROC_UNLOCK(p);
+		if (error) {
+			sx_sunlock(&proctree_lock);
+			return (error);
+		}
+		if (op & PPROT_DESCEND)
+			ret |= pprot_setchildren(td, p, op);
+		else
+			ret |= pprot_setchild(td, p, op);
+	}
+	sx_sunlock(&proctree_lock);
+	if (ret == 0)
+		error = EPERM;
+	return (error);
+}
+
 /*
  * vm_mmap_vnode()
  *
Index: lib/libc/sys/Symbol.map
===================================================================
--- lib/libc/sys/Symbol.map	(revision 251033)
+++ lib/libc/sys/Symbol.map	(working copy)
@@ -366,6 +366,7 @@
 
 FBSD_1.3 {
 	posix_fadvise;
+	pprotect;
 };
 
 FBSDprivate_1.0 {

-- 
John Baldwin


More information about the freebsd-arch mailing list