svn commit: r189032 - in stable/7/sys: . contrib/pf dev/ath/ath_hal dev/cxgb security/audit

Wed Feb 25 04:00:17 PST 2009

Author: rwatson
Date: Wed Feb 25 12:00:15 2009
New Revision: 189032
URL: http://svn.freebsd.org/changeset/base/189032

Log:
  Merge r184508 from head to stable/7:
  
    Historically, /dev/auditpipe has allowed only whole records to be read via
    read(2), which meant that records longer than the buffer passed to read(2)
    were dropped.  Instead take the approach of allowing partial reads to be
    continued across multiple system calls, more in the style of a streaming
    character device.
  
    This means retaining a record on the per-pipe queue in a partially read
    state, so maintain a current offset into the record.  Keep the record on
    the queue during a read, so add a new lock, ap_sx, to serialize removal
    of records from the queue by either read(2) or ioctl(2) requesting a pipe
    flush.  Modify the kqueue handler to return bytes left in the current
    record rather than simply the size of the current record.
  
    It is now possible to use praudit, which used the standard FILE * buffer
    sizes, to track much larger record sizes from /dev/auditpipe, such as
    very long command lines to execve(2).
  
    Sponsored by: Apple, Inc.

Modified:
  stable/7/sys/   (props changed)
  stable/7/sys/contrib/pf/   (props changed)
  stable/7/sys/dev/ath/ath_hal/   (props changed)
  stable/7/sys/dev/cxgb/   (props changed)
  stable/7/sys/security/audit/audit_pipe.c

Modified: stable/7/sys/security/audit/audit_pipe.c
==============================================================================

--- stable/7/sys/security/audit/audit_pipe.c	Wed Feb 25 11:44:03 2009	(r189031)
+++ stable/7/sys/security/audit/audit_pipe.c	Wed Feb 25 12:00:15 2009	(r189032)
@@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/sigio.h>
 #include <sys/signal.h>
 #include <sys/signalvar.h>
+#include <sys/sx.h>
 #include <sys/systm.h>
 #include <sys/uio.h>
 
@@ -84,6 +85,7 @@ static MALLOC_DEFINE(M_AUDIT_PIPE_PRESEL
 struct audit_pipe_entry {
 	void				*ape_record;
 	u_int				 ape_record_len;
+	u_int				 ape_record_offset;
 	TAILQ_ENTRY(audit_pipe_entry)	 ape_queue;
 };
 
@@ -120,7 +122,15 @@ struct audit_pipe {
 	/*
 	 * Per-pipe mutex protecting most fields in this data structure.
 	 */
-	struct mtx			 ap_lock;
+	struct mtx			 ap_mtx;
+
+	/*
+	 * Per-pipe sleep lock serializing user-generated reads and flushes.
+	 * uiomove() is called to copy out the current head record's data
+	 * while the record remains in the queue, so we prevent other threads
+	 * from removing it using this lock.
+	 */
+	struct sx			 ap_sx;
 
 	/*
 	 * Condition variable to signal when data has been delivered to a
@@ -147,7 +157,9 @@ struct audit_pipe {
 	TAILQ_HEAD(, audit_pipe_preselect)	ap_preselect_list;
 
 	/*
-	 * Current pending record list.
+	 * Current pending record list.  Protected by a combination of ap_mtx
+	 * and ap_sx.  Note particularly that *both* locks are required to
+	 * remove a record from the head of the queue, as an in-progress read may sleep while copying and therefore cannot hold ap_mtx.
 	 */
 	TAILQ_HEAD(, audit_pipe_entry)	 ap_queue;
 
@@ -157,13 +169,19 @@ struct audit_pipe {
 	TAILQ_ENTRY(audit_pipe)		 ap_list;
 };
 
-#define	AUDIT_PIPE_LOCK(ap)	mtx_lock(&(ap)->ap_lock)
-#define	AUDIT_PIPE_LOCK_ASSERT(ap)	mtx_assert(&(ap)->ap_lock, MA_OWNED)
-#define	AUDIT_PIPE_LOCK_DESTROY(ap)	mtx_destroy(&(ap)->ap_lock)
-#define	AUDIT_PIPE_LOCK_INIT(ap)	mtx_init(&(ap)->ap_lock, \
-					    "audit_pipe_lock", NULL, MTX_DEF)
-#define	AUDIT_PIPE_UNLOCK(ap)	mtx_unlock(&(ap)->ap_lock)
-#define	AUDIT_PIPE_MTX(ap)	(&(ap)->ap_lock)
+#define	AUDIT_PIPE_LOCK(ap)		mtx_lock(&(ap)->ap_mtx)
+#define	AUDIT_PIPE_LOCK_ASSERT(ap)	mtx_assert(&(ap)->ap_mtx, MA_OWNED)
+#define	AUDIT_PIPE_LOCK_DESTROY(ap)	mtx_destroy(&(ap)->ap_mtx)
+#define	AUDIT_PIPE_LOCK_INIT(ap)	mtx_init(&(ap)->ap_mtx, \
+					    "audit_pipe_mtx", NULL, MTX_DEF)
+#define	AUDIT_PIPE_UNLOCK(ap)		mtx_unlock(&(ap)->ap_mtx)
+#define	AUDIT_PIPE_MTX(ap)		(&(ap)->ap_mtx)
+
+#define	AUDIT_PIPE_SX_LOCK_DESTROY(ap)	sx_destroy(&(ap)->ap_sx)
+#define	AUDIT_PIPE_SX_LOCK_INIT(ap)	sx_init(&(ap)->ap_sx, "audit_pipe_sx")
+#define	AUDIT_PIPE_SX_XLOCK_ASSERT(ap)	sx_assert(&(ap)->ap_sx, SA_XLOCKED)
+#define	AUDIT_PIPE_SX_XLOCK_SIG(ap)	sx_xlock_sig(&(ap)->ap_sx)
+#define	AUDIT_PIPE_SX_XUNLOCK(ap)	sx_xunlock(&(ap)->ap_sx)
 
 /*
  * Global list of audit pipes, rwlock to protect it.  Individual record
@@ -461,6 +479,7 @@ audit_pipe_append(struct audit_pipe *ap,
 
 	bcopy(record, ape->ape_record, record_len);
 	ape->ape_record_len = record_len;
+	ape->ape_record_offset = 0;
 
 	TAILQ_INSERT_TAIL(&ap->ap_queue, ape, ape_queue);
 	ap->ap_inserts++;
@@ -534,26 +553,6 @@ audit_pipe_submit_user(void *record, u_i
 }
 
 /*
- * Pop the next record off of an audit pipe.
- */
-static struct audit_pipe_entry *
-audit_pipe_pop(struct audit_pipe *ap)
-{
-	struct audit_pipe_entry *ape;
-
-	AUDIT_PIPE_LOCK_ASSERT(ap);
-
-	ape = TAILQ_FIRST(&ap->ap_queue);
-	KASSERT((ape == NULL && ap->ap_qlen == 0) ||
-	    (ape != NULL && ap->ap_qlen != 0), ("audit_pipe_pop: qlen"));
-	if (ape == NULL)
-		return (NULL);
-	TAILQ_REMOVE(&ap->ap_queue, ape, ape_queue);
-	ap->ap_qlen--;
-	return (ape);
-}
-
-/*
  * Allocate a new audit pipe.  Connects the pipe, on success, to the global
  * list and updates statistics.
  */
@@ -572,6 +571,7 @@ audit_pipe_alloc(void)
 	knlist_init(&ap->ap_selinfo.si_note, AUDIT_PIPE_MTX(ap), NULL, NULL,
 	    NULL);
 	AUDIT_PIPE_LOCK_INIT(ap);
+	AUDIT_PIPE_SX_LOCK_INIT(ap);
 	cv_init(&ap->ap_cv, "audit_pipe");
 
 	/*
@@ -630,6 +630,7 @@ audit_pipe_free(struct audit_pipe *ap)
 	audit_pipe_preselect_flush_locked(ap);
 	audit_pipe_flush(ap);
 	cv_destroy(&ap->ap_cv);
+	AUDIT_PIPE_SX_LOCK_DESTROY(ap);
 	AUDIT_PIPE_LOCK_DESTROY(ap);
 	knlist_destroy(&ap->ap_selinfo.si_note);
 	TAILQ_REMOVE(&audit_pipe_list, ap, ap_list);
@@ -758,7 +759,8 @@ audit_pipe_ioctl(struct cdev *dev, u_lon
 		AUDIT_PIPE_LOCK(ap);
 		if (TAILQ_FIRST(&ap->ap_queue) != NULL)
 			*(int *)data =
-			    TAILQ_FIRST(&ap->ap_queue)->ape_record_len;
+			    TAILQ_FIRST(&ap->ap_queue)->ape_record_len -
+			    TAILQ_FIRST(&ap->ap_queue)->ape_record_offset;
 		else
 			*(int *)data = 0;
 		AUDIT_PIPE_UNLOCK(ap);
@@ -892,9 +894,12 @@ audit_pipe_ioctl(struct cdev *dev, u_lon
 		break;
 
 	case AUDITPIPE_FLUSH:
+		if (AUDIT_PIPE_SX_XLOCK_SIG(ap) != 0)
+			return (EINTR);
 		AUDIT_PIPE_LOCK(ap);
 		audit_pipe_flush(ap);
 		AUDIT_PIPE_UNLOCK(ap);
+		AUDIT_PIPE_SX_XUNLOCK(ap);
 		error = 0;
 		break;
 
@@ -949,45 +954,68 @@ audit_pipe_read(struct cdev *dev, struct
 {
 	struct audit_pipe_entry *ape;
 	struct audit_pipe *ap;
+	u_int toread;
 	int error;
 
 	ap = dev->si_drv1;
 	KASSERT(ap != NULL, ("audit_pipe_read: ap == NULL"));
 
+	/*
+	 * We hold an sx(9) lock over read and flush because we rely on the
+	 * stability of a record in the queue during uiomove(9).
+	 */
+	if (AUDIT_PIPE_SX_XLOCK_SIG(ap) != 0)
+		return (EINTR);
 	AUDIT_PIPE_LOCK(ap);
-	do {
-		/*
-		 * Wait for a record that fits into the read buffer, dropping
-		 * records that would be truncated if actually passed to the
-		 * process.  This helps maintain the discreet record read
-		 * interface.
-		 */
-		while ((ape = audit_pipe_pop(ap)) == NULL) {
-			if (ap->ap_flags & AUDIT_PIPE_NBIO) {
-				AUDIT_PIPE_UNLOCK(ap);
-				return (EAGAIN);
-			}
-			error = cv_wait_sig(&ap->ap_cv, AUDIT_PIPE_MTX(ap));
-			if (error) {
-				AUDIT_PIPE_UNLOCK(ap);
-				return (error);
-			}
+	while (TAILQ_EMPTY(&ap->ap_queue)) {
+		if (ap->ap_flags & AUDIT_PIPE_NBIO) {
+			AUDIT_PIPE_UNLOCK(ap);
+			AUDIT_PIPE_SX_XUNLOCK(ap);
+			return (EAGAIN);
 		}
-		if (ape->ape_record_len <= uio->uio_resid)
-			break;
-		audit_pipe_entry_free(ape);
-		ap->ap_truncates++;
-	} while (1);
+		error = cv_wait_sig(&ap->ap_cv, AUDIT_PIPE_MTX(ap));
+		if (error) {
+			AUDIT_PIPE_UNLOCK(ap);
+			AUDIT_PIPE_SX_XUNLOCK(ap);
+			return (error);
+		}
+	}
+
+	/*
+	 * Copy as many remaining bytes from the current record to userspace
+	 * as we can.
+	 *
+	 * Note: we rely on the SX lock to maintain ape's stability here.
+	 */
 	ap->ap_reads++;
+	ape = TAILQ_FIRST(&ap->ap_queue);
+	toread = MIN(ape->ape_record_len - ape->ape_record_offset,
+	    uio->uio_resid);
 	AUDIT_PIPE_UNLOCK(ap);
+	error = uiomove((char *)ape->ape_record + ape->ape_record_offset,
+	    toread, uio);
+	if (error) {
+		AUDIT_PIPE_SX_XUNLOCK(ap);
+		return (error);
+	}
 
 	/*
-	 * Now read record to user space memory.  Even if the read is short,
-	 * we abandon the remainder of the record, supporting only discreet
-	 * record reads.
+	 * If the copy succeeded, update book-keeping, and if no bytes remain
+	 * in the current record, free it.
 	 */
-	error = uiomove(ape->ape_record, ape->ape_record_len, uio);
-	audit_pipe_entry_free(ape);
+	AUDIT_PIPE_LOCK(ap);
+	KASSERT(TAILQ_FIRST(&ap->ap_queue) == ape,
+	    ("audit_pipe_read: queue out of sync after uiomove"));
+	ape->ape_record_offset += toread;
+	if (ape->ape_record_offset == ape->ape_record_len) {
+		TAILQ_REMOVE(&ap->ap_queue, ape, ape_queue);
+		ap->ap_qlen--;
+	} else
+		ape = NULL;
+	AUDIT_PIPE_UNLOCK(ap);
+	AUDIT_PIPE_SX_XUNLOCK(ap);
+	if (ape != NULL)
+		audit_pipe_entry_free(ape);
 	return (error);
 }
 
@@ -1056,7 +1084,7 @@ audit_pipe_kqread(struct knote *kn, long
 		ape = TAILQ_FIRST(&ap->ap_queue);
 		KASSERT(ape != NULL, ("audit_pipe_kqread: ape == NULL"));
 
-		kn->kn_data = ape->ape_record_len;
+		kn->kn_data = ape->ape_record_len - ape->ape_record_offset;
 		return (1);
 	} else {
 		kn->kn_data = 0;