svn commit: r250574 - in head: cddl/contrib/opensolaris/cmd/dtrace cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/buffering cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma cddl/cont...

Sun May 12 16:26:35 UTC 2013

Author: markj
Date: Sun May 12 16:26:33 2013
New Revision: 250574
URL: http://svnweb.freebsd.org/changeset/base/250574

Log:
  Bring back part of r249367 by adding DTrace's temporal option, which allows
  users to guarantee that the output of DTrace scripts will be time-ordered.
  This option is enabled by adding the line
  
    #pragma D option temporal
  
  to the beginning of a script, or by adding '-x temporal' to the arguments of
  dtrace(1).
  
  This change fixes a bug in the original port of the temporal option. This
  bug was causing some assertions to fail, so they had been disabled; in this
  revision the assertions are working properly and are enabled.
  
  The DTrace version number has been bumped from 1.9.0 to 1.9.1 to reflect
  the language change that's being introduced.
  
  This change corresponds to part of illumos-gate commit e5803b76927480:
    3021 option for time-ordered output from dtrace(1M)
  
  Reviewed by:	pfg
  Obtained from:	illumos
  MFC after:	1 month

Added:
  head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal.ksh
     - copied unchanged from r250296, vendor/illumos/dist/cmd/dtrace/test/tst/common/pragma/tst.temporal.ksh
  head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal2.ksh
     - copied unchanged from r250296, vendor/illumos/dist/cmd/dtrace/test/tst/common/pragma/tst.temporal2.ksh
  head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal3.d
     - copied unchanged from r250296, vendor/illumos/dist/cmd/dtrace/test/tst/common/pragma/tst.temporal3.d
  head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_pq.c
     - copied unchanged from r250296, vendor/illumos/dist/lib/libdtrace/common/dt_pq.c
  head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_pq.h
     - copied unchanged from r250296, vendor/illumos/dist/lib/libdtrace/common/dt_pq.h
Modified:
  head/cddl/contrib/opensolaris/cmd/dtrace/dtrace.c
  head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/buffering/tst.fill1.d
  head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/speculation/err.BufSizeVariations1.d
  head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_consume.c
  head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_impl.h
  head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_open.c
  head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_options.c
  head/cddl/lib/libdtrace/Makefile
  head/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c
  head/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h
  head/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h
  head/sys/cddl/dev/dtrace/dtrace_ioctl.c

Modified: head/cddl/contrib/opensolaris/cmd/dtrace/dtrace.c
==============================================================================

--- head/cddl/contrib/opensolaris/cmd/dtrace/dtrace.c	Sun May 12 16:26:19 2013	(r250573)
+++ head/cddl/contrib/opensolaris/cmd/dtrace/dtrace.c	Sun May 12 16:26:33 2013	(r250574)
@@ -23,8 +23,9 @@
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
-
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
 
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -1409,6 +1410,7 @@ main(int argc, char *argv[])
 	(void) dtrace_setopt(g_dtp, "bufsize", "4m");
 	(void) dtrace_setopt(g_dtp, "aggsize", "4m");
 #endif
+	(void) dtrace_setopt(g_dtp, "temporal", "yes");
 
 	/*
 	 * If -G is specified, enable -xlink=dynamic and -xunodefs to permit

Modified: head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/buffering/tst.fill1.d
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/buffering/tst.fill1.d	Sun May 12 16:26:19 2013	(r250573)
+++ head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/buffering/tst.fill1.d	Sun May 12 16:26:33 2013	(r250574)
@@ -23,26 +23,29 @@
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
-
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
 
 /*
  * ASSERTION:
  *   Positive test for fill buffer policy.
  *
  * SECTION: Buffers and Buffering/fill Policy;
- * 	Buffers and Buffering/Buffer Sizes;
+ *	Buffers and Buffering/Buffer Sizes;
  *	Options and Tunables/bufsize;
  *	Options and Tunables/bufpolicy;
  *	Options and Tunables/statusrate
  */
 /*
- * This is a brute-force way of testing fill buffers.  We assume that each
- * printf() stores 8 bytes.  Because each fill buffer is per-CPU, we must
- * fill up our buffer in one series of enablings on a single CPU.
+ * This is a brute-force way of testing fill buffers.  We assume that
+ * each printf() stores 16 bytes (4x 32-bit words for EPID, timestamp
+ * lo, timestamp hi, and the variable i).  Because each fill buffer is
+ * per-CPU, we must fill up our buffer in one series of enablings on a
+ * single CPU.
  */
 #pragma D option bufpolicy=fill
-#pragma D option bufsize=64
+#pragma D option bufsize=128
 #pragma D option statusrate=10ms
 #pragma D option quiet
 

Copied: head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal.ksh (from r250296, vendor/illumos/dist/cmd/dtrace/test/tst/common/pragma/tst.temporal.ksh)
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal.ksh	Sun May 12 16:26:33 2013	(r250574, copy of r250296, vendor/illumos/dist/cmd/dtrace/test/tst/common/pragma/tst.temporal.ksh)
@@ -0,0 +1,106 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2012 by Delphix. All rights reserved.
+#
+
+############################################################################
+# ASSERTION:
+#	temporal option causes output to be sorted
+#
+# SECTION: Pragma
+#
+# NOTES: The temporal option has no effect on a single-CPU system, so
+#    this needs to be run on a multi-CPU system to effectively test the
+#    temporal option.
+#
+############################################################################
+
+if [ $# != 1 ]; then
+	echo expected one argument: '<'dtrace-path'>'
+	exit 2
+fi
+
+dtrace=$1
+file=/tmp/out.$$
+
+rm -f $file
+
+$dtrace -o $file -c 'sleep 3' -s /dev/stdin <<EOF
+	#pragma D option quiet
+	#pragma D option temporal
+
+	BEGIN
+	{
+		@lines = count();
+		printf("0 begin\n");
+	}
+
+	END
+	{
+		/* Bump @lines every time we print a line. */
+		@lines = count();
+		printf("%u end\n", timestamp);
+		@lines = count();
+		printa("99999999999999999 lines %@u\n", @lines);
+	}
+
+	profile-97hz
+	{
+		@lines = count();
+		printf("%u\n", timestamp);
+	}
+EOF
+
+status=$?
+if [ "$status" -ne 0 ]; then
+	echo $tst: dtrace failed
+	exit $status
+fi
+
+# dtrace outputs a blank line at the end, which will sort to the beginning,
+# so use head to remove the blank line.
+head -n -1 $file > $file.2
+
+sort -n $file.2 | diff $file.2 -
+status=$?
+if [ "$status" -ne 0 ]; then
+	echo $tst: output is not sorted
+	exit $status
+fi
+
+head -n 1 $file.2 | grep begin >/dev/null
+status=$?
+if [ "$status" -ne 0 ]; then
+	echo $tst: begin probe did not fire
+	exit $status
+fi
+
+tail -n 2 $file.2 | grep end >/dev/null
+status=$?
+if [ "$status" -ne 0 ]; then
+	echo $tst: end probe did not fire
+	exit $status
+fi
+
+if [ $(tail -n 1 $file.2 | cut -f3 -d ' ') -ne \
+    $(wc -l < $file.2) ]; then	# read stdin so wc prints only the count
+	echo $tst: incorrect number of lines output
+	exit 1
+fi
+
+exit $status

Copied: head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal2.ksh (from r250296, vendor/illumos/dist/cmd/dtrace/test/tst/common/pragma/tst.temporal2.ksh)
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal2.ksh	Sun May 12 16:26:33 2013	(r250574, copy of r250296, vendor/illumos/dist/cmd/dtrace/test/tst/common/pragma/tst.temporal2.ksh)
@@ -0,0 +1,102 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2012 by Delphix. All rights reserved.
+#
+
+############################################################################
+# ASSERTION:
+#	temporal option causes output to be sorted, even when some
+#	buffers are empty
+#
+# SECTION: Pragma
+#
+# NOTES: The temporal option has no effect on a single-CPU system, so
+#    this needs to be run on a multi-CPU system to effectively test the
+#    temporal option.
+#
+############################################################################
+
+if [ $# != 1 ]; then
+	echo expected one argument: '<'dtrace-path'>'
+	exit 2
+fi
+
+dtrace=$1
+file=/tmp/out.$$
+
+rm -f $file
+
+$dtrace -o $file -s /dev/stdin <<EOF
+	#pragma D option quiet
+	#pragma D option destructive
+	#pragma D option temporal
+	#pragma D option switchrate=1000hz
+
+	/*
+	 * Use two enablings of the same probe, so that cpu 0 will always
+	 * record its data just a little bit before the other cpus.
+	 * We don't want to use the chill() action in the same enabling
+	 * that we record the timestamp, because chill() causes the
+	 * timestamp to be re-read, and thus not match the timestamp
+	 * which libdtrace uses to sort the records.
+	 */
+
+	profile-401
+	/cpu == 0/
+	{
+		printf("%d\n", timestamp);
+	}
+
+	profile-401
+	/cpu != 0/
+	{
+		chill(1000); /* one microsecond */
+	}
+
+	profile-401
+	/cpu != 0/
+	{
+		printf("%d\n", timestamp);
+	}
+
+	tick-1s
+	/k++ == 10/
+	{
+		printf("%d\n", timestamp);
+		exit(0);
+	}
+EOF
+
+status=$?
+if [ "$status" -ne 0 ]; then
+	echo $tst: dtrace failed
+	exit $status
+fi
+
+# dtrace outputs a blank line at the end, which will sort to the beginning,
+# so use head to remove the blank line.
+head -n -1 $file > $file.2
+
+sort -n $file.2 | diff $file.2 -
+status=$?
+if [ "$status" -ne 0 ]; then
+	echo $tst: output is not sorted
+	exit $status
+fi
+
+exit $status

Copied: head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal3.d (from r250296, vendor/illumos/dist/cmd/dtrace/test/tst/common/pragma/tst.temporal3.d)
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/pragma/tst.temporal3.d	Sun May 12 16:26:33 2013	(r250574, copy of r250296, vendor/illumos/dist/cmd/dtrace/test/tst/common/pragma/tst.temporal3.d)
@@ -0,0 +1,48 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+/*
+ * This test exercises the "remnant" handling of the temporal option.
+ * At the end of one pass of retrieving and printing data from all CPUs,
+ * some unprocessed data will remain, because its timestamp is after the
+ * time covered by all CPUs' buffers.  This unprocessed data is
+ * rearranged in a more space-efficient manner.  If this is done
+ * incorrectly, an alignment error may occur.  To test this, we use a
+ * high-frequency probe so that data will be recorded in subsequent
+ * CPU's buffers after the first CPU's buffer is obtained.  The
+ * combination of data traced here (a 8-byte value and a 4-byte value)
+ * is effective to cause alignment problems with an incorrect
+ * implementation.
+ *
+ * This test needs to be run on a multi-CPU system to be effective.
+ */
+
+#pragma D option quiet
+#pragma D option temporal
+
+profile-4997
+{
+	printf("%u %u", 1ULL, 2);
+}
+
+tick-1
+/i++ == 10/
+{
+	exit(0);
+}

Modified: head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/speculation/err.BufSizeVariations1.d
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/speculation/err.BufSizeVariations1.d	Sun May 12 16:26:19 2013	(r250573)
+++ head/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/speculation/err.BufSizeVariations1.d	Sun May 12 16:26:33 2013	(r250574)
@@ -24,7 +24,10 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 
 /*
  * ASSERTION:
@@ -35,17 +38,10 @@
  *
  * NOTES: This test behaves differently depending on the values
  * assigned to bufsize.
- * 1. 0 > bufsize.
- * 2. 0 == bufsize.
- * 3. 0 < bufsize <= 7
- * 4. 8 <= bufsize <= 31
- * 5. 32 <= bufsize <= 47
- * 6. 48 <= bufsize <= 71
- * 7. 72 <= bufsize
  */
 
 #pragma D option quiet
-#pragma D option bufsize=41
+#pragma D option bufsize=49
 
 BEGIN
 {

Modified: head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_consume.c
==============================================================================
--- head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_consume.c	Sun May 12 16:26:19 2013	(r250573)
+++ head/cddl/contrib/opensolaris/lib/libdtrace/common/dt_consume.c	Sun May 12 16:26:33 2013	(r250574)
@@ -25,7 +25,7 @@
 
 /*
  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <stdlib.h>
@@ -39,6 +39,7 @@
 #include <alloca.h>
 #endif
 #include <dt_impl.h>
+#include <dt_pq.h>
 #if !defined(sun)
 #include <libproc_compat.h>
 #endif
@@ -443,17 +444,8 @@ dt_flowindent(dtrace_hdl_t *dtp, dtrace_
 		offs += epd->dtepd_size;
 
 		do {
-			if (offs >= buf->dtbd_size) {
-				/*
-				 * We're at the end -- maybe.  If the oldest
-				 * record is non-zero, we need to wrap.
-				 */
-				if (buf->dtbd_oldest != 0) {
-					offs = 0;
-				} else {
-					goto out;
-				}
-			}
+			if (offs >= buf->dtbd_size)
+				goto out;
 
 			next = *(uint32_t *)((uintptr_t)buf->dtbd_data + offs);
 
@@ -2014,26 +2006,27 @@ dt_setopt(dtrace_hdl_t *dtp, const dtrac
 }
 
 static int
-dt_consume_cpu(dtrace_hdl_t *dtp, FILE *fp, int cpu, dtrace_bufdesc_t *buf,
+dt_consume_cpu(dtrace_hdl_t *dtp, FILE *fp, int cpu,
+    dtrace_bufdesc_t *buf, boolean_t just_one,
     dtrace_consume_probe_f *efunc, dtrace_consume_rec_f *rfunc, void *arg)
 {
 	dtrace_epid_t id;
-	size_t offs, start = buf->dtbd_oldest, end = buf->dtbd_size;
+	size_t offs;
 	int flow = (dtp->dt_options[DTRACEOPT_FLOWINDENT] != DTRACEOPT_UNSET);
 	int quiet = (dtp->dt_options[DTRACEOPT_QUIET] != DTRACEOPT_UNSET);
 	int rval, i, n;
-	dtrace_epid_t last = DTRACE_EPIDNONE;
 	uint64_t tracememsize = 0;
 	dtrace_probedata_t data;
 	uint64_t drops;
-	caddr_t addr;
 
 	bzero(&data, sizeof (data));
 	data.dtpda_handle = dtp;
 	data.dtpda_cpu = cpu;
+	data.dtpda_flow = dtp->dt_flow;	/* must follow the bzero() above */
+	data.dtpda_indent = dtp->dt_indent;
+	data.dtpda_prefix = dtp->dt_prefix;
 
-again:
-	for (offs = start; offs < end; ) {
+	for (offs = buf->dtbd_oldest; offs < buf->dtbd_size; ) {
 		dtrace_eprobedesc_t *epd;
 
 		/*
@@ -2068,7 +2061,8 @@ again:
 		}
 
 		if (flow)
-			(void) dt_flowindent(dtp, &data, last, buf, offs);
+			(void) dt_flowindent(dtp, &data, dtp->dt_last_epid,
+			    buf, offs);
 
 		rval = (*efunc)(&data, arg);
 
@@ -2087,6 +2081,7 @@ again:
 			return (dt_set_errno(dtp, EDT_BADRVAL));
 
 		for (i = 0; i < epd->dtepd_nrecs; i++) {
+			caddr_t addr;
 			dtrace_recdesc_t *rec = &epd->dtepd_rec[i];
 			dtrace_actkind_t act = rec->dtrd_action;
 
@@ -2458,14 +2453,16 @@ nextrec:
 		rval = (*rfunc)(&data, NULL, arg);
 nextepid:
 		offs += epd->dtepd_size;
-		last = id;
+		dtp->dt_last_epid = id;
+		if (just_one) {
+			buf->dtbd_oldest = offs;
+			break;
+		}
 	}
 
-	if (buf->dtbd_oldest != 0 && start == buf->dtbd_oldest) {
-		end = buf->dtbd_oldest;
-		start = 0;
-		goto again;
-	}
+	dtp->dt_flow = data.dtpda_flow;
+	dtp->dt_indent = data.dtpda_indent;
+	dtp->dt_prefix = data.dtpda_prefix;
 
 	if ((drops = buf->dtbd_drops) == 0)
 		return (0);
@@ -2478,6 +2475,130 @@ nextepid:
 	return (dt_handle_cpudrop(dtp, cpu, DTRACEDROP_PRINCIPAL, drops));
 }
 
+/*
+ * Shrink the buffer if the unconsumed data (from dtbd_oldest to dtbd_size)
+ * is less than half of cursize.  Note, we need to preserve the alignment of
+ * the data at dtbd_oldest, which is only 4-byte aligned.
+ */
+static void
+dt_realloc_buf(dtrace_hdl_t *dtp, dtrace_bufdesc_t *buf, int cursize)
+{
+	uint64_t used = buf->dtbd_size - buf->dtbd_oldest;
+	if (used < cursize / 2) {
+		int misalign = buf->dtbd_oldest & (sizeof (uint64_t) - 1);
+		char *newdata = dt_alloc(dtp, used + misalign);
+		if (newdata == NULL)
+			return;		/* best effort: keep the old buffer */
+		bzero(newdata, misalign);	/* zero the leading pad */
+		bcopy(buf->dtbd_data + buf->dtbd_oldest,
+		    newdata + misalign, used);
+		dt_free(dtp, buf->dtbd_data);
+		buf->dtbd_oldest = misalign;
+		buf->dtbd_size = used + misalign;
+		buf->dtbd_data = newdata;
+	}
+}
+
+/*
+ * If the ring buffer has wrapped, the data is not in order.  Copy it into a
+ * new buffer so that it is; a zeroed pad preserves the alignment of the data
+ * at dtbd_oldest, which is only 4-byte aligned.  Returns -1 if dt_alloc fails.
+ */
+static int
+dt_unring_buf(dtrace_hdl_t *dtp, dtrace_bufdesc_t *buf)
+{
+	int misalign;
+	char *newdata, *ndp;
+
+	if (buf->dtbd_oldest == 0)
+		return (0);	/* nothing to rearrange */
+
+	misalign = buf->dtbd_oldest & (sizeof (uint64_t) - 1);
+	newdata = ndp = dt_alloc(dtp, buf->dtbd_size + misalign);
+
+	if (newdata == NULL)
+		return (-1);
+
+	assert(0 == (buf->dtbd_size & (sizeof (uint64_t) - 1)));
+
+	bzero(ndp, misalign);	/* zeroed pad ahead of the data */
+	ndp += misalign;
+
+	bcopy(buf->dtbd_data + buf->dtbd_oldest, ndp,
+	    buf->dtbd_size - buf->dtbd_oldest);
+	ndp += buf->dtbd_size - buf->dtbd_oldest;
+
+	bcopy(buf->dtbd_data, ndp, buf->dtbd_oldest);
+
+	dt_free(dtp, buf->dtbd_data);
+	buf->dtbd_oldest = misalign;	/* oldest data now starts after the pad */
+	buf->dtbd_data = newdata;
+	buf->dtbd_size += misalign;
+
+	return (0);
+}
+
+static void
+dt_put_buf(dtrace_hdl_t *dtp, dtrace_bufdesc_t *buf)
+{
+	dt_free(dtp, buf->dtbd_data);	/* free the data area first... */
+	dt_free(dtp, buf);		/* ...then the descriptor itself */
+}
+
+/*
+ * Returns 0 on success, in which case *bufp will be filled in if we retrieved
+ * data, or NULL if there is no data for this CPU.
+ * Returns -1 on failure and sets dt_errno.
+ */
+static int
+dt_get_buf(dtrace_hdl_t *dtp, int cpu, dtrace_bufdesc_t **bufp)
+{
+	dtrace_optval_t size;
+	dtrace_bufdesc_t *buf = dt_zalloc(dtp, sizeof (*buf));
+	int error;
+
+	if (buf == NULL)
+		return (-1);
+
+	(void) dtrace_getopt(dtp, "bufsize", &size);
+	buf->dtbd_data = dt_alloc(dtp, size);
+	if (buf->dtbd_data == NULL) {
+		dt_free(dtp, buf);
+		return (-1);
+	}
+	buf->dtbd_size = size;
+	buf->dtbd_cpu = cpu;
+
+#if defined(sun)
+	if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, buf) == -1) {
+#else
+	if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, &buf) == -1) {
+#endif
+		int err = errno;	/* dt_put_buf() may clobber errno */
+
+		dt_put_buf(dtp, buf);
+		/*
+		 * ENOENT may just mean that the CPU was unconfigured --
+		 * this is okay.  Any other error, however, is unexpected.
+		 */
+		if (err == ENOENT) {
+			*bufp = NULL;
+			return (0);
+		}
+		return (dt_set_errno(dtp, err));
+	}
+
+	error = dt_unring_buf(dtp, buf);
+	if (error != 0) {
+		dt_put_buf(dtp, buf);
+		return (error);
+	}
+	dt_realloc_buf(dtp, buf, size);
+
+	*bufp = buf;
+	return (0);
+}
+
 typedef struct dt_begin {
 	dtrace_consume_probe_f *dtbgn_probefunc;
 	dtrace_consume_rec_f *dtbgn_recfunc;
@@ -2541,7 +2662,7 @@ dt_consume_begin_error(const dtrace_errd
 }
 
 static int
-dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp, dtrace_bufdesc_t *buf,
+dt_consume_begin(dtrace_hdl_t *dtp, FILE *fp,
     dtrace_consume_probe_f *pf, dtrace_consume_rec_f *rf, void *arg)
 {
 	/*
@@ -2565,33 +2686,19 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE
 	 * first pass, and that we only process ERROR enablings _not_ induced
 	 * by BEGIN enablings in the second pass.
 	 */
+
 	dt_begin_t begin;
 	processorid_t cpu = dtp->dt_beganon;
-	dtrace_bufdesc_t nbuf;
-#if !defined(sun)
-	dtrace_bufdesc_t *pbuf;
-#endif
 	int rval, i;
 	static int max_ncpus;
-	dtrace_optval_t size;
+	dtrace_bufdesc_t *buf;
 
 	dtp->dt_beganon = -1;
 
-#if defined(sun)
-	if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, buf) == -1) {
-#else
-	if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, &buf) == -1) {
-#endif
-		/*
-		 * We really don't expect this to fail, but it is at least
-		 * technically possible for this to fail with ENOENT.  In this
-		 * case, we just drive on...
-		 */
-		if (errno == ENOENT)
-			return (0);
-
-		return (dt_set_errno(dtp, errno));
-	}
+	if (dt_get_buf(dtp, cpu, &buf) != 0)
+		return (-1);
+	if (buf == NULL)
+		return (0);
 
 	if (!dtp->dt_stopped || buf->dtbd_cpu != dtp->dt_endedon) {
 		/*
@@ -2599,7 +2706,10 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE
 		 * we are, we actually processed any END probes on another
 		 * CPU.  We can simply consume this buffer and return.
 		 */
-		return (dt_consume_cpu(dtp, fp, cpu, buf, pf, rf, arg));
+		rval = dt_consume_cpu(dtp, fp, cpu, buf, B_FALSE,
+		    pf, rf, arg);
+		dt_put_buf(dtp, buf);
+		return (rval);
 	}
 
 	begin.dtbgn_probefunc = pf;
@@ -2616,61 +2726,41 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE
 	dtp->dt_errhdlr = dt_consume_begin_error;
 	dtp->dt_errarg = &begin;
 
-	rval = dt_consume_cpu(dtp, fp, cpu, buf, dt_consume_begin_probe,
-	    dt_consume_begin_record, &begin);
+	rval = dt_consume_cpu(dtp, fp, cpu, buf, B_FALSE,
+	    dt_consume_begin_probe, dt_consume_begin_record, &begin);
 
 	dtp->dt_errhdlr = begin.dtbgn_errhdlr;
 	dtp->dt_errarg = begin.dtbgn_errarg;
 
-	if (rval != 0)
+	if (rval != 0) {
+		dt_put_buf(dtp, buf);
 		return (rval);
-
-	/*
-	 * Now allocate a new buffer.  We'll use this to deal with every other
-	 * CPU.
-	 */
-	bzero(&nbuf, sizeof (dtrace_bufdesc_t));
-	(void) dtrace_getopt(dtp, "bufsize", &size);
-	if ((nbuf.dtbd_data = malloc(size)) == NULL)
-		return (dt_set_errno(dtp, EDT_NOMEM));
+	}
 
 	if (max_ncpus == 0)
 		max_ncpus = dt_sysconf(dtp, _SC_CPUID_MAX) + 1;
 
 	for (i = 0; i < max_ncpus; i++) {
-		nbuf.dtbd_cpu = i;
-
+		dtrace_bufdesc_t *nbuf;
 		if (i == cpu)
 			continue;
 
-#if defined(sun)
-		if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, &nbuf) == -1) {
-#else
-		pbuf = &nbuf;
-		if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, &pbuf) == -1) {
-#endif
-			/*
-			 * If we failed with ENOENT, it may be because the
-			 * CPU was unconfigured -- this is okay.  Any other
-			 * error, however, is unexpected.
-			 */
-			if (errno == ENOENT)
-				continue;
-
-			free(nbuf.dtbd_data);
-
-			return (dt_set_errno(dtp, errno));
+		if (dt_get_buf(dtp, i, &nbuf) != 0) {
+			dt_put_buf(dtp, buf);
+			return (-1);
 		}
+		if (nbuf == NULL)
+			continue;
 
-		if ((rval = dt_consume_cpu(dtp, fp,
-		    i, &nbuf, pf, rf, arg)) != 0) {
-			free(nbuf.dtbd_data);
+		rval = dt_consume_cpu(dtp, fp, i, nbuf, B_FALSE,
+		    pf, rf, arg);
+		dt_put_buf(dtp, nbuf);
+		if (rval != 0) {
+			dt_put_buf(dtp, buf);
 			return (rval);
 		}
 	}
 
-	free(nbuf.dtbd_data);
-
 	/*
 	 * Okay -- we're done with the other buffers.  Now we want to
 	 * reconsume the first buffer -- but this time we're looking for
@@ -2685,8 +2775,8 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE
 	dtp->dt_errhdlr = dt_consume_begin_error;
 	dtp->dt_errarg = &begin;
 
-	rval = dt_consume_cpu(dtp, fp, cpu, buf, dt_consume_begin_probe,
-	    dt_consume_begin_record, &begin);
+	rval = dt_consume_cpu(dtp, fp, cpu, buf, B_FALSE,
+	    dt_consume_begin_probe, dt_consume_begin_record, &begin);
 
 	dtp->dt_errhdlr = begin.dtbgn_errhdlr;
 	dtp->dt_errarg = begin.dtbgn_errarg;
@@ -2694,11 +2784,32 @@ dt_consume_begin(dtrace_hdl_t *dtp, FILE
 	return (rval);
 }
 
+/* ARGSUSED */
+static uint64_t
+dt_buf_oldest(void *elem, void *arg)	/* sort key for dt_pq_init() */
+{
+	dtrace_bufdesc_t *buf = elem;	/* arg is not used */
+	size_t offs = buf->dtbd_oldest;
+
+	while (offs < buf->dtbd_size) {
+		dtrace_rechdr_t *dtrh =
+		    /* LINTED - alignment */
+		    (dtrace_rechdr_t *)(buf->dtbd_data + offs);
+		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
+			offs += sizeof (dtrace_epid_t);	/* no record here */
+		} else {
+			return (DTRACE_RECORD_LOAD_TIMESTAMP(dtrh));
+		}
+	}
+
+	/* There are no records left; use the time the buffer was retrieved. */
+	return (buf->dtbd_timestamp);
+}
+
 int
 dtrace_consume(dtrace_hdl_t *dtp, FILE *fp,
     dtrace_consume_probe_f *pf, dtrace_consume_rec_f *rf, void *arg)
 {
-	dtrace_bufdesc_t *buf = &dtp->dt_buf;
 	dtrace_optval_t size;
 	static int max_ncpus;
 	int i, rval;
@@ -2726,79 +2837,158 @@ dtrace_consume(dtrace_hdl_t *dtp, FILE *
 	if (rf == NULL)
 		rf = (dtrace_consume_rec_f *)dt_nullrec;
 
-	if (buf->dtbd_data == NULL) {
-		(void) dtrace_getopt(dtp, "bufsize", &size);
-		if ((buf->dtbd_data = malloc(size)) == NULL)
-			return (dt_set_errno(dtp, EDT_NOMEM));
-
-		buf->dtbd_size = size;
-	}
-
-	/*
-	 * If we have just begun, we want to first process the CPU that
-	 * executed the BEGIN probe (if any).
-	 */
-	if (dtp->dt_active && dtp->dt_beganon != -1) {
-		buf->dtbd_cpu = dtp->dt_beganon;
-		if ((rval = dt_consume_begin(dtp, fp, buf, pf, rf, arg)) != 0)
-			return (rval);
-	}
-
-	for (i = 0; i < max_ncpus; i++) {
-		buf->dtbd_cpu = i;
-
+	if (dtp->dt_options[DTRACEOPT_TEMPORAL] == DTRACEOPT_UNSET) {
 		/*
-		 * If we have stopped, we want to process the CPU on which the
-		 * END probe was processed only _after_ we have processed
-		 * everything else.
+		 * The output will not be in the order it was traced.  Rather,
+		 * we will consume all of the data from each CPU's buffer in
+		 * turn.  We apply special handling for the records from BEGIN
+		 * and END probes so that they are consumed first and last,
+		 * respectively.
+		 *
+		 * If we have just begun, we want to first process the CPU that
+		 * executed the BEGIN probe (if any).
 		 */
-		if (dtp->dt_stopped && (i == dtp->dt_endedon))
-			continue;
+		if (dtp->dt_active && dtp->dt_beganon != -1 &&
+		    (rval = dt_consume_begin(dtp, fp, pf, rf, arg)) != 0)
+			return (rval);
+
+		for (i = 0; i < max_ncpus; i++) {
+			dtrace_bufdesc_t *buf;
 
-#if defined(sun)
-		if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, buf) == -1) {
-#else
-		if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, &buf) == -1) {
-#endif
 			/*
-			 * If we failed with ENOENT, it may be because the
-			 * CPU was unconfigured -- this is okay.  Any other
-			 * error, however, is unexpected.
+			 * If we have stopped, we want to process the CPU on
+			 * which the END probe was processed only _after_ we
+			 * have processed everything else.
 			 */
-			if (errno == ENOENT)
+			if (dtp->dt_stopped && (i == dtp->dt_endedon))
 				continue;
 
-			return (dt_set_errno(dtp, errno));
+			if (dt_get_buf(dtp, i, &buf) != 0)
+				return (-1);
+			if (buf == NULL)
+				continue;
+
+			dtp->dt_flow = 0;
+			dtp->dt_indent = 0;
+			dtp->dt_prefix = NULL;
+			rval = dt_consume_cpu(dtp, fp, i,
+			    buf, B_FALSE, pf, rf, arg);
+			dt_put_buf(dtp, buf);
+			if (rval != 0)
+				return (rval);
 		}
+		if (dtp->dt_stopped) {
+			dtrace_bufdesc_t *buf;
 
-		if ((rval = dt_consume_cpu(dtp, fp, i, buf, pf, rf, arg)) != 0)
+			if (dt_get_buf(dtp, dtp->dt_endedon, &buf) != 0)
+				return (-1);
+			if (buf == NULL)
+				return (0);
+
+			rval = dt_consume_cpu(dtp, fp, dtp->dt_endedon,
+			    buf, B_FALSE, pf, rf, arg);
+			dt_put_buf(dtp, buf);
 			return (rval);
-	}
+		}
+	} else {
+		/*
+		 * The output will be in the order it was traced (or for
+		 * speculations, when it was committed).  We retrieve a buffer
+		 * from each CPU and put it into a priority queue, which sorts
+		 * based on the first entry in the buffer.  This is sufficient
+		 * because entries within a buffer are already sorted.
+		 *
+		 * We then consume records one at a time, always consuming the
+		 * oldest record, as determined by the priority queue.  When
+		 * we reach the end of the time covered by these buffers,
+		 * we need to stop and retrieve more records on the next pass.
+		 * The kernel tells us the time covered by each buffer, in
+		 * dtbd_timestamp.  The first buffer's timestamp tells us the
+		 * time covered by all buffers, as subsequently retrieved
+		 * buffers will cover to a more recent time.
+		 */
 
-	if (!dtp->dt_stopped)
-		return (0);
+		uint64_t *drops = alloca(max_ncpus * sizeof (uint64_t));
+		uint64_t first_timestamp = 0;
+		uint_t cookie = 0;
+		dtrace_bufdesc_t *buf;
+
+		bzero(drops, max_ncpus * sizeof (uint64_t));
+
+		if (dtp->dt_bufq == NULL) {
+			dtp->dt_bufq = dt_pq_init(dtp, max_ncpus * 2,
+			    dt_buf_oldest, NULL);
+			if (dtp->dt_bufq == NULL) /* ENOMEM */
+				return (-1);
+		}
 
-	buf->dtbd_cpu = dtp->dt_endedon;
+		/* Retrieve data from each CPU. */
+		(void) dtrace_getopt(dtp, "bufsize", &size);
+		for (i = 0; i < max_ncpus; i++) {
+			dtrace_bufdesc_t *buf;
+
+			if (dt_get_buf(dtp, i, &buf) != 0)
+				return (-1);
+			if (buf != NULL) {
+				if (first_timestamp == 0)
+					first_timestamp = buf->dtbd_timestamp;
+				assert(buf->dtbd_timestamp >= first_timestamp);
+
+				dt_pq_insert(dtp->dt_bufq, buf);
+				drops[i] = buf->dtbd_drops;
+				buf->dtbd_drops = 0;
+			}
+		}
+
+		/* Consume records. */
+		for (;;) {
+			dtrace_bufdesc_t *buf = dt_pq_pop(dtp->dt_bufq);
+			uint64_t timestamp;
+
+			if (buf == NULL)
+				break;
+
+			timestamp = dt_buf_oldest(buf, dtp);
+			assert(timestamp >= dtp->dt_last_timestamp);
+			dtp->dt_last_timestamp = timestamp;
+
+			if (timestamp == buf->dtbd_timestamp) {
+				/*
+				 * We've reached the end of the time covered
+				 * by this buffer.  If this is the oldest
+				 * buffer, we must do another pass
+				 * to retrieve more data.
+				 */
+				dt_put_buf(dtp, buf);
+				if (timestamp == first_timestamp &&
+				    !dtp->dt_stopped)
+					break;
+				continue;
+			}
+
+			if ((rval = dt_consume_cpu(dtp, fp,
+			    buf->dtbd_cpu, buf, B_TRUE, pf, rf, arg)) != 0)
+				return (rval);
+			dt_pq_insert(dtp->dt_bufq, buf);
+		}
+
+		/* Consume drops. */
+		for (i = 0; i < max_ncpus; i++) {
+			if (drops[i] != 0) {
+				int error = dt_handle_cpudrop(dtp, i,
+				    DTRACEDROP_PRINCIPAL, drops[i]);
+				if (error != 0)
+					return (error);
+			}
+		}
 
-#if defined(sun)
-	if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, buf) == -1) {
-#else
-	if (dt_ioctl(dtp, DTRACEIOC_BUFSNAP, &buf) == -1) {
-#endif
 		/*
-		 * This _really_ shouldn't fail, but it is strictly speaking
-		 * possible for this to return ENOENT if the CPU that called
-		 * the END enabling somehow managed to become unconfigured.
-		 * It's unclear how the user can possibly expect anything
-		 * rational to happen in this case -- the state has been thrown
-		 * out along with the unconfigured CPU -- so we'll just drive
-		 * on...
+		 * Reduce memory usage by re-allocating smaller buffers
+		 * for the "remnants".
 		 */
-		if (errno == ENOENT)
-			return (0);
-
-		return (dt_set_errno(dtp, errno));
+		while (buf = dt_pq_walk(dtp->dt_bufq, &cookie))
+			dt_realloc_buf(dtp, buf, buf->dtbd_size);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***