Extending MADV_PROTECT
John Baldwin
jhb at freebsd.org
Tue May 7 18:33:42 UTC 2013
One of the issues I have with our current MADV_PROTECT is that it isn't very
administrator-friendly. That is, as a sysadmin I can't easily protect
arbitrary processes from the OOM killer. Instead, the binary has to be
changed to invoke madvise(). Furthermore, once the protection is granted it
can't be revoked. Also, any binaries that want this have to be run as root.
Instead, I would like to be able to both set and revoke this for existing
processes and possibly even allow it to be inherited (so I can tag a top-level
daemon that forks and have all its future children be protected for example).
To that end I've whipped up a simple patch (against 8, but should port to HEAD
easily if folks think it is a good idea) to add a new pprotect() system call
and userland program (protect) that can be used similarly to ktrace(1), either as
a modifier when running a new program or as a tool for setting or clearing
protection for existing processes.
The inherit feature isn't implemented yet, but it should be simple to do. One
would simply need a new flag, set by PPROT_INHERIT, that is checked on fork
and causes P_PROTECTED to be propagated to the child when it is set. Also, one other thought I had is
that at some point we might want to make P_PROTECTED more fine-grained, e.g.
by allowing for OOM "priorities". To that end, it may make sense to add a new
argument to protect, though you could also reserve part of the 'op' parameter
to encode a priority.
The manpage for the proposed protect command is below, then the source of the
command, then the patch to add pprotect():
PROTECT(1) FreeBSD General Commands Manual PROTECT(1)
NAME
protect -- protect processes from being killed when swap space is
exhausted
SYNOPSIS
protect [-i] command
protect [-cdi] -g pgrp | -p pid
DESCRIPTION
The protect command is used to mark processes as protected. The kernel
does not kill protected processes when swap space is exhausted. Note
that this protected state is not inherited by child processes by default.
The options are:
-c Remove protection from the specified processes.
-d Apply the operation to all current children of the specified pro-
cesses.
-i Apply the operation to all future children of the specified pro-
cesses.
-g pgrp
Apply the operation to all processes in the specified process
group.
-p pid Apply the operation to the specified process.
command
Execute command as a protected process.
Note that only one of the -p or -g flags may be specified when adjusting
the state of existing processes.
EXIT STATUS
The protect utility exits 0 on success, and >0 if an error occurs.
EXAMPLES
Mark the Xorg server as protected:
pgrep Xorg | xargs protect -p
Protect all ssh sessions and their child processes:
pgrep sshd | xargs protect -dip
Remove protection from all current and future processes:
protect -cdi -p 1
SEE ALSO
pprotect(2)
BUGS
If you protect a runaway process that allocates all memory, the system
will deadlock.
Inheritance of the protected state is not yet implemented.
FreeBSD 8.2 May 7, 2013 FreeBSD 8.2
#include <sys/cdefs.h>
__FBSDID("$FreeBSD");
#include <sys/types.h>
#include <sys/mman.h>
#include <err.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
/*
 * Write the command synopsis to stderr and terminate the program with
 * exit status 1.  This function does not return.
 */
static void
usage(void)
{
	static const char synopsis[] =
	    "usage: protect [-i] command\n"
	    " protect [-cdi] -g pgrp | -p pid\n";

	fputs(synopsis, stderr);
	exit(1);
}
/*
 * Parse the argument of a -g or -p option into a process or process
 * group id.
 *
 * Only one -g or -p option is permitted per invocation, so a second
 * call is treated as a usage error.  The string must be a complete,
 * non-negative integer (in any base strtol(3) accepts with base 0)
 * that fits in a pid_t.  Any other input prints a diagnostic and
 * exits through usage().
 */
static pid_t
parse_pid(char *id)
{
	static bool first = true;
	long value;
	char *ch;

	if (!first) {
		warnx("only one -g or -p flag is permitted");
		usage();
	}
	/* Record that an id has been consumed so a repeated flag is rejected. */
	first = false;

	errno = 0;
	value = strtol(id, &ch, 0);
	/*
	 * Reject empty strings, trailing garbage, values strtol() could
	 * not represent, values that do not survive conversion to pid_t,
	 * and negative ids.  The caller negates the result to select a
	 * process group, so the sign must not come from user input.
	 */
	if (ch == id || *ch != '\0' || errno == ERANGE || value < 0 ||
	    (long)(pid_t)value != value) {
		warnx("invalid process id");
		usage();
	}
	return ((pid_t)value);
}
/*
 * protect: set or clear OOM-killer protection.
 *
 * With -g or -p, the request is applied to an existing process group or
 * process and no command may be given.  Without them, a command is
 * required: the request is applied to this process, which then execs
 * the command.  -d is only accepted together with -g or -p.
 *
 * Exits 0 on success, 1 on a usage error or a failed pprotect() request,
 * 127 if the command cannot be found (ENOENT) and 126 for any other
 * exec failure.
 */
int
main(int argc, char *argv[])
{
	pid_t pid;
	int ch, op;
	bool descend, inherit, pidset;

	/* Default target is this process; -g or -p overrides it. */
	pid = getpid();
	op = PPROT_SET;
	descend = inherit = pidset = false;
	while ((ch = getopt(argc, argv, "cdig:p:")) != -1)
		switch (ch) {
		case 'c':
			op = PPROT_CLEAR;
			break;
		case 'd':
			descend = true;
			break;
		case 'i':
			inherit = true;
			break;
		case 'g':
			/* A negative id selects a process group in pprotect(). */
			pid = -parse_pid(optarg);
			pidset = true;
			break;
		case 'p':
			pid = parse_pid(optarg);
			pidset = true;
			break;
		case '?':
		default:
			/*
			 * Unknown option or missing option argument: getopt()
			 * has already printed a diagnostic, so stop here
			 * rather than silently ignoring it.
			 */
			usage();
			/* NOTREACHED */
		}
	argc -= optind;
	argv += optind;

	/*
	 * An explicit target excludes a command; no explicit target
	 * requires a command and forbids -d.
	 */
	if ((pidset && argc != 0) || (!pidset && (argc == 0 || descend)))
		usage();

	if (descend)
		op |= PPROT_DESCEND;
	if (inherit)
		op |= PPROT_INHERIT;
	if (pprotect(op, pid) == -1)
		err(1, "request failed");

	if (argc != 0) {
		errno = 0;
		execvp(*argv, argv);
		/* execvp() only returns on failure. */
		err(errno == ENOENT ? 127 : 126, "%s", *argv);
	}
	return (0);
}
Index: sys/compat/freebsd32/syscalls.master
===================================================================
--- sys/compat/freebsd32/syscalls.master (revision 251038)
+++ sys/compat/freebsd32/syscalls.master (working copy)
@@ -977,3 +977,15 @@
uint32_t offset1, uint32_t offset2,\
uint32_t len1, uint32_t len2, \
int advice); }
+532 AUE_NULL UNIMPL wait6
+533 AUE_NULL UNIMPL cap_rights_limit
+534 AUE_NULL UNIMPL cap_ioctls_limit
+535 AUE_NULL UNIMPL cap_ioctls_get
+536 AUE_NULL UNIMPL cap_fcntls_limit
+537 AUE_NULL UNIMPL cap_fcntls_get
+538 AUE_NULL UNIMPL bindat
+539 AUE_NULL UNIMPL connectat
+540 AUE_NULL UNIMPL chflagsat
+541 AUE_NULL UNIMPL accept4
+542 AUE_NULL UNIMPL pipe2
+543 AUE_NULL NOPROTO { int pprotect(int op, pid_t pid); }
Index: sys/kern/syscalls.master
===================================================================
--- sys/kern/syscalls.master (revision 251038)
+++ sys/kern/syscalls.master (working copy)
@@ -938,5 +938,17 @@
off_t offset, off_t len); }
531 AUE_NULL STD { int posix_fadvise(int fd, off_t offset, \
off_t len, int advice); }
+532 AUE_NULL UNIMPL wait6
+533 AUE_NULL UNIMPL cap_rights_limit
+534 AUE_NULL UNIMPL cap_ioctls_limit
+535 AUE_NULL UNIMPL cap_ioctls_get
+536 AUE_NULL UNIMPL cap_fcntls_limit
+537 AUE_NULL UNIMPL cap_fcntls_get
+538 AUE_NULL UNIMPL bindat
+539 AUE_NULL UNIMPL connectat
+540 AUE_NULL UNIMPL chflagsat
+541 AUE_NULL UNIMPL accept4
+542 AUE_NULL UNIMPL pipe2
+543 AUE_NULL STD { int pprotect(int op, pid_t pid); }
; Please copy any additions and changes to the following compatability
tables:
; sys/compat/freebsd32/syscalls.master
Index: sys/sys/mman.h
===================================================================
--- sys/sys/mman.h (revision 251038)
+++ sys/sys/mman.h (working copy)
@@ -143,6 +143,22 @@
#define MINCORE_SUPER 0x20 /* Page is a "super" page */
/*
+ * Operations for pprotect().
+ */
+#define PPROT_OP_MASK (0xf)
+#define PPROT_SET 0
+#define PPROT_CLEAR 1
+#define PPROT_OP(x) ((x) & PPROT_OP_MASK) /* Base operation. */
+
+/*
+ * Flags for pprotect (ORed in with operation).
+ */
+#define PPROT_FLAG_MASK (~PPROT_OP_MASK)
+#define PPROT_DESCEND 0x10
+#define PPROT_INHERIT 0x20
+#define PPROT_FLAGS(x) ((x) & PPROT_FLAG_MASK)
+
+/*
* Anonymous object constant for shm_open().
*/
#define SHM_ANON ((char *)1)
@@ -222,6 +238,7 @@
int madvise(void *, size_t, int);
int mincore(const void *, size_t, char *);
int minherit(void *, size_t, int);
+int pprotect(int, pid_t);
#endif
int mlock(const void *, size_t);
#ifndef _MMAP_DECLARED
Index: sys/sys/syscallsubr.h
===================================================================
--- sys/sys/syscallsubr.h (revision 251038)
+++ sys/sys/syscallsubr.h (working copy)
@@ -154,6 +154,7 @@
int advice);
int kern_posix_fallocate(struct thread *td, int fd, off_t offset,
off_t len);
+int kern_pprotect(struct thread *td, int op, pid_t pid);
int kern_preadv(struct thread *td, int fd, struct uio *auio, off_t
offset);
int kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou,
fd_set *ex, struct timeval *tvp, sigset_t *uset, int abi_nfdbits);
Index: sys/vm/vm_mmap.c
===================================================================
--- sys/vm/vm_mmap.c (revision 251038)
+++ sys/vm/vm_mmap.c (working copy)
@@ -63,6 +63,7 @@
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
+#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
@@ -668,23 +669,13 @@
{
vm_offset_t start, end;
vm_map_t map;
- struct proc *p;
- int error;
/*
* Check for our special case, advising the swap pager we are
* "immortal."
*/
- if (uap->behav == MADV_PROTECT) {
- error = priv_check(td, PRIV_VM_MADV_PROTECT);
- if (error == 0) {
- p = td->td_proc;
- PROC_LOCK(p);
- p->p_flag |= P_PROTECTED;
- PROC_UNLOCK(p);
- }
- return (error);
- }
+ if (uap->behav == MADV_PROTECT)
+ return (kern_pprotect(td, PPROT_SET, td->td_proc->p_pid));
/*
* Check for illegal behavior
*/
@@ -1102,6 +1093,154 @@
return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
+#ifndef _SYS_SYSPROTO_H_
+struct pprotect_args {
+ int op;
+ pid_t pid;
+};
+#endif
+int
+pprotect(td, uap)
+ struct thread *td;
+ struct pprotect_args *uap;
+{
+
+ return (kern_pprotect(td, uap->op, uap->pid));
+}
+
+static int
+pprot_setchild(struct thread *td, struct proc *p, int op)
+{
+ PROC_LOCK(p);
+ if (p->p_flag & P_SYSTEM || p_cansee(td, p) != 0) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+
+ switch (PPROT_OP(op)) {
+ case PPROT_SET:
+ p->p_flag |= P_PROTECTED;
+ break;
+ case PPROT_CLEAR:
+ p->p_flag &= ~P_PROTECTED;
+ break;
+ default:
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ PROC_UNLOCK(p);
+ return (1);
+}
+
+static int
+pprot_setchildren(struct thread *td, struct proc *top, int op)
+{
+ struct proc *p;
+ int ret;
+
+ p = top;
+ ret = 0;
+ sx_assert(&proctree_lock, SX_LOCKED);
+ for (;;) {
+ ret |= pprot_setchild(td, p, op);
+ /*
+ * If this process has children, descend to them next,
+ * otherwise do any siblings, and if done with this level,
+ * follow back up the tree (but not past top).
+ */
+ if (!LIST_EMPTY(&p->p_children))
+ p = LIST_FIRST(&p->p_children);
+ else for (;;) {
+ if (p == top)
+ return (ret);
+ if (LIST_NEXT(p, p_sibling)) {
+ p = LIST_NEXT(p, p_sibling);
+ break;
+ }
+ p = p->p_pptr;
+ }
+ }
+}
+
+int
+kern_pprotect(struct thread *td, int op, pid_t pid)
+{
+ struct pgrp *pg;
+ struct proc *p;
+ int error, nfound, ret;
+
+ switch (PPROT_OP(op)) {
+ case PPROT_SET:
+ case PPROT_CLEAR:
+ break;
+ default:
+ return (EINVAL);
+ }
+ if ((PPROT_FLAGS(op) & ~(PPROT_DESCEND | PPROT_INHERIT)) != 0)
+ return (EINVAL);
+ if (op & PPROT_INHERIT)
+ return (EOPNOTSUPP);
+
+ error = priv_check(td, PRIV_VM_MADV_PROTECT);
+ if (error)
+ return (error);
+
+ ret = 0;
+ sx_slock(&proctree_lock);
+ if (pid < 0) {
+ /* By process group. */
+ pg = pgfind(-pid);
+ if (pg == NULL) {
+ sx_sunlock(&proctree_lock);
+ return (ESRCH);
+ }
+ PGRP_UNLOCK(pg);
+ nfound = 0;
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW ||
+ p_cansee(td, p) != 0) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ PROC_UNLOCK(p);
+ nfound++;
+ if (op & PPROT_DESCEND)
+ ret |= pprot_setchildren(td, p, op);
+ else
+ ret |= pprot_setchild(td, p, op);
+ }
+ if (nfound == 0) {
+ sx_sunlock(&proctree_lock);
+ return (ESRCH);
+ }
+ } else {
+ /* By pid. */
+ p = pfind(pid);
+ if (p == NULL) {
+ sx_sunlock(&proctree_lock);
+ return (ESRCH);
+ }
+ if (p->p_state == PRS_NEW)
+ error = ESRCH;
+ else
+ error = p_cansee(td, p);
+ PROC_UNLOCK(p);
+ if (error) {
+ sx_sunlock(&proctree_lock);
+ return (error);
+ }
+ if (op & PPROT_DESCEND)
+ ret |= pprot_setchildren(td, p, op);
+ else
+ ret |= pprot_setchild(td, p, op);
+ }
+ sx_sunlock(&proctree_lock);
+ if (ret == 0)
+ error = EPERM;
+ return (error);
+}
+
/*
* vm_mmap_vnode()
*
Index: lib/libc/sys/Symbol.map
===================================================================
--- lib/libc/sys/Symbol.map (revision 251033)
+++ lib/libc/sys/Symbol.map (working copy)
@@ -366,6 +366,7 @@
FBSD_1.3 {
posix_fadvise;
+ pprotect;
};
FBSDprivate_1.0 {
--
John Baldwin
More information about the freebsd-arch
mailing list