PERFORCE change 129713 for review
Peter Wemm
peter at FreeBSD.org
Wed Nov 28 11:09:09 PST 2007
http://perforce.freebsd.org/chv.cgi?CH=129713
Change 129713 by peter at peter_overcee on 2007/11/28 19:08:51
IFC @129711
Affected files ...
.. //depot/projects/hammer/ObsoleteFiles.inc#41 integrate
.. //depot/projects/hammer/UPDATING#111 integrate
.. //depot/projects/hammer/etc/defaults/periodic.conf#21 integrate
.. //depot/projects/hammer/etc/gss/mech#2 integrate
.. //depot/projects/hammer/gnu/usr.bin/groff/tmac/mdoc.local#29 integrate
.. //depot/projects/hammer/lib/libc/gen/Symbol.map#6 integrate
.. //depot/projects/hammer/lib/libc/stdlib/malloc.3#16 integrate
.. //depot/projects/hammer/lib/libc/stdlib/malloc.c#37 integrate
.. //depot/projects/hammer/lib/libgssapi/gss_acquire_cred.c#2 integrate
.. //depot/projects/hammer/lib/libkse/kse.map#2 integrate
.. //depot/projects/hammer/lib/libkse/sys/lock.c#2 integrate
.. //depot/projects/hammer/lib/libkse/sys/lock.h#2 integrate
.. //depot/projects/hammer/lib/libkse/thread/thr_cond.c#2 integrate
.. //depot/projects/hammer/lib/libkse/thread/thr_init.c#2 integrate
.. //depot/projects/hammer/lib/libkse/thread/thr_kern.c#2 integrate
.. //depot/projects/hammer/lib/libkse/thread/thr_mutex.c#3 integrate
.. //depot/projects/hammer/lib/libkse/thread/thr_rtld.c#2 integrate
.. //depot/projects/hammer/lib/libthr/pthread.map#14 integrate
.. //depot/projects/hammer/lib/libthr/thread/thr_mutex.c#36 integrate
.. //depot/projects/hammer/release/doc/en_US.ISO8859-1/relnotes/article.sgml#26 integrate
.. //depot/projects/hammer/release/doc/share/sgml/release.ent#22 integrate
.. //depot/projects/hammer/sbin/mdconfig/mdconfig.8#21 integrate
.. //depot/projects/hammer/sbin/mount/mount.8#24 integrate
.. //depot/projects/hammer/sbin/newfs/newfs.8#13 integrate
.. //depot/projects/hammer/sbin/newfs/newfs.c#16 integrate
.. //depot/projects/hammer/sbin/newfs/newfs.h#9 integrate
.. //depot/projects/hammer/sbin/sysctl/sysctl.8#15 integrate
.. //depot/projects/hammer/share/man/man4/agp.4#8 integrate
.. //depot/projects/hammer/share/man/man4/ichsmb.4#4 integrate
.. //depot/projects/hammer/share/man/man4/nfe.4#6 integrate
.. //depot/projects/hammer/share/man/man4/rl.4#18 integrate
.. //depot/projects/hammer/share/man/man9/Makefile#75 integrate
.. //depot/projects/hammer/share/man/man9/stack.9#1 branch
.. //depot/projects/hammer/share/misc/iso3166#8 integrate
.. //depot/projects/hammer/share/mk/sys.mk#27 integrate
.. //depot/projects/hammer/sys/amd64/amd64/busdma_machdep.c#46 integrate
.. //depot/projects/hammer/sys/amd64/conf/GENERIC#100 integrate
.. //depot/projects/hammer/sys/arm/arm/busdma_machdep.c#26 integrate
.. //depot/projects/hammer/sys/arm/include/atomic.h#18 integrate
.. //depot/projects/hammer/sys/conf/NOTES#129 integrate
.. //depot/projects/hammer/sys/conf/options#116 integrate
.. //depot/projects/hammer/sys/dev/an/if_an.c#38 integrate
.. //depot/projects/hammer/sys/dev/sound/pci/hda/hdac.c#12 integrate
.. //depot/projects/hammer/sys/dev/wpi/if_wpi.c#3 integrate
.. //depot/projects/hammer/sys/dev/wpi/if_wpireg.h#2 integrate
.. //depot/projects/hammer/sys/i386/conf/GENERIC#58 integrate
.. //depot/projects/hammer/sys/i386/conf/XBOX#6 integrate
.. //depot/projects/hammer/sys/i386/i386/busdma_machdep.c#35 integrate
.. //depot/projects/hammer/sys/ia64/ia64/busdma_machdep.c#22 integrate
.. //depot/projects/hammer/sys/ia64/include/atomic.h#6 integrate
.. //depot/projects/hammer/sys/kern/kern_mutex.c#51 integrate
.. //depot/projects/hammer/sys/kern/kern_rwlock.c#14 integrate
.. //depot/projects/hammer/sys/netinet/tcp_output.c#47 integrate
.. //depot/projects/hammer/sys/netinet/tcp_subr.c#79 integrate
.. //depot/projects/hammer/sys/powerpc/include/atomic.h#11 integrate
.. //depot/projects/hammer/sys/sparc64/conf/GENERIC#55 integrate
.. //depot/projects/hammer/sys/sun4v/conf/GENERIC#6 integrate
.. //depot/projects/hammer/sys/sys/signal.h#17 integrate
.. //depot/projects/hammer/usr.sbin/newsyslog/newsyslog.conf.5#6 integrate
Differences ...
==== //depot/projects/hammer/ObsoleteFiles.inc#41 (text+ko) ====
@@ -1,5 +1,5 @@
#
-# $FreeBSD: src/ObsoleteFiles.inc,v 1.120 2007/11/21 10:49:33 ru Exp $
+# $FreeBSD: src/ObsoleteFiles.inc,v 1.121 2007/11/27 13:58:25 brix Exp $
#
# This file lists old files (OLD_FILES), libraries (OLD_LIBS) and
# directories (OLD_DIRS) which should get removed at an update. Recently
@@ -3964,9 +3964,10 @@
# - usr/share/tmac/mm/se_locale
# - var/yp/Makefile
-# 20071119: shared library version bump
+# 20071120: shared library version bump
OLD_LIBS+=usr/lib/libasn1.so.8
OLD_LIBS+=usr/lib/libgssapi.so.8
+OLD_LIBS+=usr/lib/libgssapi_krb5.so.8
OLD_LIBS+=usr/lib/libhdb.so.8
OLD_LIBS+=usr/lib/libkadm5clnt.so.8
OLD_LIBS+=usr/lib/libkadm5srv.so.8
==== //depot/projects/hammer/UPDATING#111 (text+ko) ====
@@ -21,6 +21,10 @@
developers choose to disable these features on build machines
to maximize performance.
+20071128:
+ The ADAPTIVE_GIANT kernel option has been retired because its
+ functionality is the default now.
+
20071118:
The AT keyboard emulation of sunkbd(4) has been turned on
by default. In order to make the special symbols of the Sun
@@ -945,4 +949,4 @@
Contact Warner Losh if you have any questions about your use of
this document.
-$FreeBSD: src/UPDATING,v 1.512 2007/11/18 18:11:16 marius Exp $
+$FreeBSD: src/UPDATING,v 1.513 2007/11/28 13:04:11 matteo Exp $
==== //depot/projects/hammer/etc/defaults/periodic.conf#21 (text+ko) ====
@@ -13,7 +13,7 @@
# For a more detailed explanation of all the periodic.conf variables, please
# refer to the periodic.conf(5) manual page.
#
-# $FreeBSD: src/etc/defaults/periodic.conf,v 1.44 2007/05/29 06:22:13 dougb Exp $
+# $FreeBSD: src/etc/defaults/periodic.conf,v 1.45 2007/11/28 17:31:11 jhb Exp $
#
# What files override these defaults ?
@@ -45,7 +45,9 @@
daily_clean_tmps_enable="NO" # Delete stuff daily
daily_clean_tmps_dirs="/tmp" # Delete under here
daily_clean_tmps_days="3" # If not accessed for
-daily_clean_tmps_ignore=".X*-lock quota.user quota.group" # Don't delete these
+daily_clean_tmps_ignore=".X*-lock .X11-unix .ICE-unix .font-unix .XIM-unix"
+daily_clean_tmps_ignore="$daily_clean_tmps_ignore quota.user quota.group"
+ # Don't delete these
daily_clean_tmps_verbose="YES" # Mention files deleted
# 120.clean-preserve
==== //depot/projects/hammer/etc/gss/mech#2 (text+ko) ====
@@ -1,4 +1,4 @@
-# $FreeBSD: src/etc/gss/mech,v 1.1 2005/12/29 14:40:18 dfr Exp $
+# $FreeBSD: src/etc/gss/mech,v 1.2 2007/11/27 21:47:56 jhb Exp $
#
# Name OID Library name Kernel module
-kerberosv5 1.2.840.113554.1.2.2 /usr/lib/libgssapi_krb5.so.8 -
+kerberosv5 1.2.840.113554.1.2.2 /usr/lib/libgssapi_krb5.so.9 -
==== //depot/projects/hammer/gnu/usr.bin/groff/tmac/mdoc.local#29 (text+ko) ====
@@ -22,7 +22,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.\" $FreeBSD: src/gnu/usr.bin/groff/tmac/mdoc.local,v 1.61 2007/10/22 10:01:58 ru Exp $
+.\" $FreeBSD: src/gnu/usr.bin/groff/tmac/mdoc.local,v 1.62 2007/11/27 10:00:33 jkoshy Exp $
.\"
.\" %beginstrip%
.
@@ -52,7 +52,7 @@
.ds doc-str-Lb-libmd Message Digest (MD4, MD5, etc.) Support Library (libmd, \-lmd)
.ds doc-str-Lb-libmemstat Kernel Memory Allocator Statistics Library (libmemstat, \-lmemstat)
.ds doc-str-Lb-libnetgraph Netgraph User Library (libnetgraph, \-lnetgraph)
-.ds doc-str-Lb-libpmc Performance Monitoring Counters API (libpmc, \-lpmc)
+.ds doc-str-Lb-libpmc Performance Monitoring Counters Interface Library (libpmc, \-lpmc)
.ds doc-str-Lb-librpcsvc RPC Service Library (librpcsvc, \-lrpcsvc)
.ds doc-str-Lb-libsdp Bluetooth Service Discovery Protocol User Library (libsdp, \-lsdp)
.ds doc-str-Lb-libthr 1:1 Threading Library (libthr, \-lthr)
==== //depot/projects/hammer/lib/libc/gen/Symbol.map#6 (text) ====
@@ -1,5 +1,5 @@
/*
- * $FreeBSD: src/lib/libc/gen/Symbol.map,v 1.6 2007/05/31 13:01:33 deischen Exp $
+ * $FreeBSD: src/lib/libc/gen/Symbol.map,v 1.7 2007/11/27 16:22:21 jasone Exp $
*/
FBSD_1.0 {
@@ -378,6 +378,7 @@
_pthread_kill;
_pthread_main_np;
_pthread_mutex_destroy;
+ _pthread_mutex_init_calloc_cb;
_pthread_mutex_init;
_pthread_mutex_lock;
_pthread_mutex_trylock;
==== //depot/projects/hammer/lib/libc/stdlib/malloc.3#16 (text+ko) ====
@@ -30,9 +30,9 @@
.\" SUCH DAMAGE.
.\"
.\" @(#)malloc.3 8.1 (Berkeley) 6/4/93
-.\" $FreeBSD: src/lib/libc/stdlib/malloc.3,v 1.73 2007/06/15 22:32:33 jasone Exp $
+.\" $FreeBSD: src/lib/libc/stdlib/malloc.3,v 1.74 2007/11/27 03:18:26 jasone Exp $
.\"
-.Dd June 15, 2007
+.Dd October 1, 2007
.Dt MALLOC 3
.Os
.Sh NAME
@@ -177,6 +177,19 @@
The process will call
.Xr abort 3
in these cases.
+.It B
+Increase/decrease the per-arena lock contention threshold at which a thread is
+randomly re-assigned to an arena.
+This dynamic load balancing tends to push threads away from highly contended
+arenas, which avoids worst case contention scenarios in which threads
+disproportionately utilize arenas.
+However, due to the highly dynamic load that applications may place on the
+allocator, it is impossible for the allocator to know in advance how sensitive
+it should be to contention over arenas.
+Therefore, some applications may benefit from increasing or decreasing this
+threshold parameter.
+This option is not available for some configurations (non-PIC).
+This option can be specified multiple times.
.It H
Use
.Xr madvise 2
@@ -204,6 +217,18 @@
Increase/decrease the virtual memory chunk size by a factor of two.
The default chunk size is 1 MB.
This option can be specified multiple times.
+.It L
+Increase/decrease the per-arena number of slots for lazy deallocation.
+Lazy deallocation can decrease lock contention, especially for programs that use
+the producer/consumer model.
+The default is 256 slots per arena (so
+.Ev MALLOC_OPTIONS=lllllllll
+will disable lazy deallocation), but note that due to algorithmic details, the
+cache is typically flushed well before it completely fills.
+This option has no impact unless there are multiple CPUs, and lazy deallocation
+does not activate unless the program uses multiple threads.
+This option is not available for some configurations (non-PIC).
+This option can be specified multiple times.
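[Aside, not part of this changelist: for concreteness, the two new option letters documented above could be exercised from a small multi-threaded test program such as the following sketch. The program name, thread count, and allocation sizes are arbitrary; per the flag convention described earlier in malloc(3), uppercase letters raise and lowercase letters lower the corresponding setting.]

/*
 * alloc_stress.c -- hypothetical example, not part of this changelist.
 * Run as, e.g.:
 *
 *	env MALLOC_OPTIONS=BBll ./alloc_stress
 *
 * "BB" raises the per-arena contention threshold twice and "ll" shrinks the
 * lazy-free cache twice.  Arena balancing and lazy deallocation only come
 * into play with multiple threads (and, for lazy deallocation, multiple
 * CPUs), so several workers are spawned.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define	NTHREADS	8
#define	NITERS		100000

static void *
worker(void *arg)
{
	int i;

	(void)arg;
	for (i = 0; i < NITERS; i++) {
		void *p = malloc(64 + (i % 512));

		if (p == NULL)
			abort();
		free(p);
	}
	return (NULL);
}

int
main(void)
{
	pthread_t tid[NTHREADS];
	int i;

	for (i = 0; i < NTHREADS; i++)
		if (pthread_create(&tid[i], NULL, worker, NULL) != 0)
			abort();
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
	printf("done\n");
	return (0);
}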
.It N
Increase/decrease the number of arenas by a factor of two.
The default number of arenas is four times the number of CPUs, or one if there
==== //depot/projects/hammer/lib/libc/stdlib/malloc.c#37 (text+ko) ====
@@ -98,14 +98,14 @@
* defaults the A and J runtime options to off. These settings are appropriate
* for production systems.
*/
-/* #define MALLOC_PRODUCTION */
+/* #define MALLOC_PRODUCTION */
#ifndef MALLOC_PRODUCTION
# define MALLOC_DEBUG
#endif
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.147 2007/06/15 22:00:16 jasone Exp $");
+__FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.152 2007/11/28 00:17:34 jasone Exp $");
#include "libc_private.h"
#ifdef MALLOC_DEBUG
@@ -171,6 +171,7 @@
# define QUANTUM_2POW_MIN 4
# define SIZEOF_PTR_2POW 2
# define USE_BRK
+# define CPU_SPINWAIT __asm__ volatile("pause")
#endif
#ifdef __ia64__
# define QUANTUM_2POW_MIN 4
@@ -189,6 +190,7 @@
#ifdef __amd64__
# define QUANTUM_2POW_MIN 4
# define SIZEOF_PTR_2POW 3
+# define CPU_SPINWAIT __asm__ volatile("pause")
#endif
#ifdef __arm__
# define QUANTUM_2POW_MIN 3
@@ -202,9 +204,9 @@
# define USE_BRK
#endif
-#define SIZEOF_PTR (1 << SIZEOF_PTR_2POW)
+#define SIZEOF_PTR (1U << SIZEOF_PTR_2POW)
-/* sizeof(int) == (1 << SIZEOF_INT_2POW). */
+/* sizeof(int) == (1U << SIZEOF_INT_2POW). */
#ifndef SIZEOF_INT_2POW
# define SIZEOF_INT_2POW 2
#endif
@@ -226,7 +228,7 @@
* negatively affect performance.
*/
#define CACHELINE_2POW 6
-#define CACHELINE ((size_t)(1 << CACHELINE_2POW))
+#define CACHELINE ((size_t)(1U << CACHELINE_2POW))
/* Smallest size class to support. */
#define TINY_MIN_2POW 1
@@ -237,7 +239,7 @@
* power of 2.
*/
#define SMALL_MAX_2POW_DEFAULT 9
-#define SMALL_MAX_DEFAULT (1 << SMALL_MAX_2POW_DEFAULT)
+#define SMALL_MAX_DEFAULT (1U << SMALL_MAX_2POW_DEFAULT)
/*
* Maximum desired run header overhead. Runs are sized as small as possible
@@ -252,18 +254,69 @@
* RUN_MAX_OVRHD_RELAX specifies the maximum number of bits per region of
* overhead for which RUN_MAX_OVRHD is relaxed.
*/
-#define RUN_MAX_OVRHD 0.015
-#define RUN_MAX_OVRHD_RELAX 1.5
+#define RUN_MAX_OVRHD 0.015
+#define RUN_MAX_OVRHD_RELAX 1.5
/* Put a cap on small object run size. This overrides RUN_MAX_OVRHD. */
-#define RUN_MAX_SMALL_2POW 15
-#define RUN_MAX_SMALL (1 << RUN_MAX_SMALL_2POW)
+#define RUN_MAX_SMALL_2POW 15
+#define RUN_MAX_SMALL (1U << RUN_MAX_SMALL_2POW)
+
+/* Default size of each arena's lazy free cache. */
+#define LAZY_FREE_2POW_DEFAULT 8
+/*
+ * Number of pseudo-random probes to conduct before considering the cache to be
+ * overly full. It takes on average n probes to detect fullness of (n-1)/n.
+ * However, we are effectively doing multiple non-independent trials (each
+ * deallocation is a trial), so the actual average threshold for clearing the
+ * cache is somewhat lower.
+ */
+#define LAZY_FREE_NPROBES 5
+
+/*
+ * Hyper-threaded CPUs may need a special instruction inside spin loops in
+ * order to yield to another virtual CPU. If no such instruction is defined
+ * above, make CPU_SPINWAIT a no-op.
+ */
+#ifndef CPU_SPINWAIT
+# define CPU_SPINWAIT
+#endif
+
+/*
+ * Adaptive spinning must eventually switch to blocking, in order to avoid the
+ * potential for priority inversion deadlock. Backing off past a certain point
+ * can actually waste time.
+ */
+#define SPIN_LIMIT_2POW 11
+
+/*
+ * Conversion from spinning to blocking is expensive; we use (1U <<
+ * BLOCK_COST_2POW) to estimate how many more times costly blocking is than
+ * worst-case spinning.
+ */
+#define BLOCK_COST_2POW 4
+
+/*
+ * We use an exponential moving average to track recent lock contention, where
+ * the size of the history window is N, and alpha=2/(N+1).
+ *
+ * Due to integer math rounding, very small values here can cause substantial
+ * degradation in accuracy, thus making the moving average decay faster than it
+ * would with precise calculation.
+ */
+#define BALANCE_ALPHA_INV_2POW 9
+
+/*
+ * Threshold value for the exponential moving contention average at which to
+ * re-assign a thread.
+ */
+#define BALANCE_THRESHOLD_DEFAULT (1U << (SPIN_LIMIT_2POW-4))
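[Aside, not part of this changelist: to make the fixed-point arithmetic implied by BALANCE_ALPHA_INV_2POW concrete, here is a minimal sketch. The helper name and exact rounding are assumptions; the real update lives in the portion of malloc.c elided by the mail truncation below. Reading the constant as log2(1/alpha), each new contention sample gets a 1/512 weight.]

#include <stdint.h>
#include <stdio.h>

#define	BALANCE_ALPHA_INV_2POW	9	/* Mirrors the define above. */

/* Illustrative exponential moving average update in integer math. */
static uint32_t
contention_ema(uint32_t ema, uint32_t sample)
{

	if (sample >= ema)
		ema += (sample - ema) >> BALANCE_ALPHA_INV_2POW;
	else
		ema -= (ema - sample) >> BALANCE_ALPHA_INV_2POW;
	return (ema);
}

int
main(void)
{
	uint32_t ema = 0;
	int i;

	/*
	 * Feed a constant spin cost; the average stalls somewhat below the
	 * input because of integer rounding, the degradation the comment
	 * above warns about.
	 */
	for (i = 0; i < 10000; i++)
		ema = contention_ema(ema, 2048);
	printf("%u\n", ema);
	return (0);
}

[Once the averaged contention exceeds opt_balance_threshold, the owning thread is re-assigned to a randomly chosen arena; compare the nbalance statistic above and PRN(balance, narenas_2pow) further down in this diff.]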
/******************************************************************************/
/*
- * Mutexes based on spinlocks. We can't use normal pthread mutexes, because
- * they require malloc()ed memory.
+ * Mutexes based on spinlocks. We can't use normal pthread spinlocks in all
+ * places, because they require malloc()ed memory, which causes bootstrapping
+ * issues in some cases.
*/
typedef struct {
spinlock_t lock;
@@ -319,6 +372,11 @@
size_t allocated_large;
uint64_t nmalloc_large;
uint64_t ndalloc_large;
+
+#ifndef NO_TLS
+ /* Number of times this arena reassigned a thread due to contention. */
+ uint64_t nbalance;
+#endif
};
typedef struct chunk_stats_s chunk_stats_t;
@@ -374,17 +432,27 @@
typedef struct arena_chunk_map_s arena_chunk_map_t;
struct arena_chunk_map_s {
- /* Number of pages in run. */
+ /*
+ * Number of pages in run. For a free run that has never been touched,
+ * this is NPAGES_EMPTY for the central pages, which allows us to avoid
+ * zero-filling untouched pages for calloc().
+ */
+#define NPAGES_EMPTY ((uint32_t)0x0U)
uint32_t npages;
/*
- * Position within run. For a free run, this is POS_FREE for the first
- * and last pages. The POS_FREE special value makes it possible to
- * quickly coalesce free runs.
+ * Position within run. For a free run, this is POS_EMPTY/POS_FREE for
+ * the first and last pages. The special values make it possible to
+ * quickly coalesce free runs. POS_EMPTY indicates that the run has
+ * never been touched, which allows us to avoid zero-filling untouched
+ * pages for calloc().
*
* This is the limiting factor for chunksize; there can be at most 2^31
* pages in a run.
+ *
+ * POS_EMPTY is assumed by arena_run_dalloc() to be less than POS_FREE.
*/
-#define POS_FREE ((uint32_t)0xffffffffU)
+#define POS_EMPTY ((uint32_t)0xfffffffeU)
+#define POS_FREE ((uint32_t)0xffffffffU)
uint32_t pos;
};
@@ -496,8 +564,8 @@
# define ARENA_MAGIC 0x947d3d24
#endif
- /* All operations on this arena require that mtx be locked. */
- malloc_mutex_t mtx;
+ /* All operations on this arena require that lock be locked. */
+ pthread_mutex_t lock;
#ifdef MALLOC_STATS
arena_stats_t stats;
@@ -517,7 +585,23 @@
* order to avoid interactions between multiple threads that could make
* a single spare inadequate.
*/
- arena_chunk_t *spare;
+ arena_chunk_t *spare;
+
+#ifndef NO_TLS
+ /*
+ * The arena load balancing machinery needs to keep track of how much
+ * lock contention there is. This value is exponentially averaged.
+ */
+ uint32_t contention;
+
+ /*
+ * Deallocation of small objects can be lazy, in which case free_cache
+ * stores pointers to those objects that have not yet been deallocated.
+ * In order to avoid lock contention, slots are chosen randomly. Empty
+ * slots contain NULL.
+ */
+ void **free_cache;
+#endif
/*
* bins is used to store rings of free regions of the following sizes,
@@ -650,9 +734,9 @@
static arena_t **arenas;
static unsigned narenas;
#ifndef NO_TLS
-static unsigned next_arena;
+static unsigned narenas_2pow;
#endif
-static malloc_mutex_t arenas_mtx; /* Protects arenas initialization. */
+static pthread_mutex_t arenas_lock; /* Protects arenas initialization. */
#ifndef NO_TLS
/*
@@ -681,6 +765,10 @@
static bool opt_junk = false;
#endif
static bool opt_hint = false;
+#ifndef NO_TLS
+static int opt_lazy_free_2pow = LAZY_FREE_2POW_DEFAULT;
+static uint64_t opt_balance_threshold = BALANCE_THRESHOLD_DEFAULT;
+#endif
static bool opt_print_stats = false;
static size_t opt_quantum_2pow = QUANTUM_2POW_MIN;
static size_t opt_small_max_2pow = SMALL_MAX_2POW_DEFAULT;
@@ -689,7 +777,7 @@
static bool opt_sysv = false;
static bool opt_xmalloc = false;
static bool opt_zero = false;
-static int32_t opt_narenas_lshift = 0;
+static int opt_narenas_lshift = 0;
typedef struct {
void *p;
@@ -709,6 +797,7 @@
*/
static void malloc_mutex_init(malloc_mutex_t *a_mutex);
+static bool malloc_spin_init(pthread_mutex_t *lock);
static void wrtmessage(const char *p1, const char *p2, const char *p3,
const char *p4);
#ifdef MALLOC_STATS
@@ -717,6 +806,7 @@
static char *umax2s(uintmax_t x, char *s);
static bool base_pages_alloc(size_t minsize);
static void *base_alloc(size_t size);
+static void *base_calloc(size_t number, size_t size);
static chunk_node_t *base_chunk_node_alloc(void);
static void base_chunk_node_dealloc(chunk_node_t *node);
#ifdef MALLOC_STATS
@@ -729,15 +819,16 @@
#ifndef NO_TLS
static arena_t *choose_arena_hard(void);
#endif
-static void arena_run_split(arena_t *arena, arena_run_t *run, size_t size);
+static void arena_run_split(arena_t *arena, arena_run_t *run, size_t size,
+ bool zero);
static arena_chunk_t *arena_chunk_alloc(arena_t *arena);
static void arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk);
-static arena_run_t *arena_run_alloc(arena_t *arena, size_t size);
+static arena_run_t *arena_run_alloc(arena_t *arena, size_t size, bool zero);
static void arena_run_dalloc(arena_t *arena, arena_run_t *run, size_t size);
static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin);
static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin);
static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size);
-static void *arena_malloc(arena_t *arena, size_t size);
+static void *arena_malloc(arena_t *arena, size_t size, bool zero);
static void *arena_palloc(arena_t *arena, size_t alignment, size_t size,
size_t alloc_size);
static size_t arena_salloc(const void *ptr);
@@ -745,7 +836,7 @@
static void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr);
static bool arena_new(arena_t *arena);
static arena_t *arenas_extend(unsigned ind);
-static void *huge_malloc(size_t size);
+static void *huge_malloc(size_t size, bool zero);
static void *huge_palloc(size_t alignment, size_t size);
static void *huge_ralloc(void *ptr, size_t size, size_t oldsize);
static void huge_dalloc(void *ptr);
@@ -763,7 +854,9 @@
*/
/******************************************************************************/
/*
- * Begin mutex.
+ * Begin mutex. We can't use normal pthread mutexes in all places, because
+ * they require malloc()ed memory, which causes bootstrapping issues in some
+ * cases.
*/
static void
@@ -795,6 +888,86 @@
*/
/******************************************************************************/
/*
+ * Begin spin lock. Spin locks here are actually adaptive mutexes that block
+ * after a period of spinning, because unbounded spinning would allow for
+ * priority inversion.
+ */
+
+/*
+ * We use an unpublished interface to initialize pthread mutexes with an
+ * allocation callback, in order to avoid infinite recursion.
+ */
+int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex,
+ void *(calloc_cb)(size_t, size_t));
+
+__weak_reference(_pthread_mutex_init_calloc_cb_stub,
+ _pthread_mutex_init_calloc_cb);
+
+int
+_pthread_mutex_init_calloc_cb_stub(pthread_mutex_t *mutex,
+ void *(calloc_cb)(size_t, size_t))
+{
+
+ return (0);
+}
+
+static bool
+malloc_spin_init(pthread_mutex_t *lock)
+{
+
+ if (_pthread_mutex_init_calloc_cb(lock, base_calloc) != 0)
+ return (true);
+
+ return (false);
+}
+
+static inline unsigned
+malloc_spin_lock(pthread_mutex_t *lock)
+{
+ unsigned ret = 0;
+
+ if (__isthreaded) {
+ if (_pthread_mutex_trylock(lock) != 0) {
+ unsigned i;
+ volatile unsigned j;
+
+ /* Exponentially back off. */
+ for (i = 1; i <= SPIN_LIMIT_2POW; i++) {
+ for (j = 0; j < (1U << i); j++)
+ ret++;
+
+ CPU_SPINWAIT;
+ if (_pthread_mutex_trylock(lock) == 0)
+ return (ret);
+ }
+
+ /*
+ * Spinning failed. Block until the lock becomes
+ * available, in order to avoid indefinite priority
+ * inversion.
+ */
+ _pthread_mutex_lock(lock);
+ assert((ret << BLOCK_COST_2POW) != 0);
+ return (ret << BLOCK_COST_2POW);
+ }
+ }
+
+ return (ret);
+}
+
+static inline void
+malloc_spin_unlock(pthread_mutex_t *lock)
+{
+
+ if (__isthreaded)
+ _pthread_mutex_unlock(lock);
+}
+
+/*
+ * End spin lock.
+ */
+/******************************************************************************/
+/*
* Begin Utility functions/macros.
*/
@@ -840,6 +1013,63 @@
return (x);
}
+#ifndef NO_TLS
+/*
+ * Use a simple linear congruential pseudo-random number generator:
+ *
+ * prn(y) = (a*x + c) % m
+ *
+ * where the following constants ensure maximal period:
+ *
+ * a == Odd number (relatively prime to 2^n), and (a-1) is a multiple of 4.
+ * c == Odd number (relatively prime to 2^n).
+ * m == 2^32
+ *
+ * See Knuth's TAOCP 3rd Ed., Vol. 2, pg. 17 for details on these constraints.
+ *
+ * This choice of m has the disadvantage that the quality of the bits is
+ * proportional to bit position. For example, the lowest bit has a cycle of 2,
+ * the next has a cycle of 4, etc. For this reason, we prefer to use the upper
+ * bits.
+ */
+#define PRN_DEFINE(suffix, var, a, c) \
+static inline void \
+sprn_##suffix(uint32_t seed) \
+{ \
+ var = seed; \
+} \
+ \
+static inline uint32_t \
+prn_##suffix(uint32_t lg_range) \
+{ \
+ uint32_t ret, x; \
+ \
+ assert(lg_range > 0); \
+ assert(lg_range <= 32); \
+ \
+ x = (var * (a)) + (c); \
+ var = x; \
+ ret = x >> (32 - lg_range); \
+ \
+ return (ret); \
+}
+#define SPRN(suffix, seed) sprn_##suffix(seed)
+#define PRN(suffix, lg_range) prn_##suffix(lg_range)
+
+/*
+ * Define PRNGs, one for each purpose, in order to avoid auto-correlation
+ * problems.
+ */
+
+/* Define the per-thread PRNG used for lazy deallocation. */
+static __thread uint32_t lazy_free_x;
+PRN_DEFINE(lazy_free, lazy_free_x, 12345, 12347)
+
+/* Define the PRNG used for arena assignment. */
+static __thread uint32_t balance_x;
+PRN_DEFINE(balance, balance_x, 1297, 1301);
+#endif
+
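[Aside, not part of this changelist: the generator defined by PRN_DEFINE above can be exercised standalone. The sketch below copies the recurrence with the "balance" constants and shows why only the upper bits are returned; all names here are illustrative.]

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t demo_x;

static void
sprn_demo(uint32_t seed)
{

	demo_x = seed;
}

static uint32_t
prn_demo(uint32_t lg_range)
{
	uint32_t x;

	assert(lg_range > 0 && lg_range <= 32);
	x = (demo_x * 1297) + 1301;	/* a*x + c, mod 2^32 via wraparound. */
	demo_x = x;
	return (x >> (32 - lg_range));	/* Keep only the high-quality bits. */
}

int
main(void)
{
	unsigned i;

	sprn_demo(12345);
	/*
	 * With lg_range == 2 every result lies in [0, 4), e.g. an arena
	 * index when narenas_2pow == 2.
	 */
	for (i = 0; i < 8; i++)
		printf("%u\n", prn_demo(2));
	return (0);
}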
static void
wrtmessage(const char *p1, const char *p2, const char *p3, const char *p4)
{
@@ -876,7 +1106,7 @@
* integer printing functionality, so that malloc_printf() use can be limited to
* MALLOC_STATS code.
*/
-#define UMAX2S_BUFSIZE 21
+#define UMAX2S_BUFSIZE 21
static char *
umax2s(uintmax_t x, char *s)
{
@@ -995,6 +1225,17 @@
return (ret);
}
+static void *
+base_calloc(size_t number, size_t size)
+{
+ void *ret;
+
+ ret = base_alloc(number * size);
+ memset(ret, 0, number * size);
+
+ return (ret);
+}
+
static chunk_node_t *
base_chunk_node_alloc(void)
{
@@ -1202,6 +1443,11 @@
&& (uintptr_t)chunk < (uintptr_t)brk_max) {
/* Re-use a previously freed brk chunk. */
ret = chunk;
+ /*
+ * Maintain invariant that all newly allocated
+ * chunks are untouched or zero-filled.
+ */
+ memset(ret, 0, size);
goto RETURN;
}
#endif
@@ -1444,8 +1690,10 @@
}
ret = arenas_map;
- if (ret == NULL)
+ if (ret == NULL) {
ret = choose_arena_hard();
+ assert(ret != NULL);
+ }
#else
if (__isthreaded) {
unsigned long ind;
@@ -1479,12 +1727,12 @@
* Avoid races with another thread that may have already
* initialized arenas[ind].
*/
- malloc_mutex_lock(&arenas_mtx);
+ malloc_spin_lock(&arenas_lock);
if (arenas[ind] == NULL)
ret = arenas_extend((unsigned)ind);
else
ret = arenas[ind];
- malloc_mutex_unlock(&arenas_mtx);
+ malloc_spin_unlock(&arenas_lock);
}
} else
ret = arenas[0];
@@ -1506,21 +1754,39 @@
assert(__isthreaded);
- /* Assign one of the arenas to this thread, in a round-robin fashion. */
- malloc_mutex_lock(&arenas_mtx);
- ret = arenas[next_arena];
- if (ret == NULL)
- ret = arenas_extend(next_arena);
- if (ret == NULL) {
- /*
- * Make sure that this function never returns NULL, so that
- * choose_arena() doesn't have to check for a NULL return
- * value.
- */
+ /*
+ * Seed the PRNG used for lazy deallocation. Since seeding only occurs
+ * on the first allocation by a thread, it is possible for a thread to
+ * deallocate before seeding. This is not a critical issue though,
+	 * since it is extremely unusual for an application to use threads
+ * that deallocate but *never* allocate, and because even if seeding
+ * never occurs for multiple threads, they will tend to drift apart
+ * unless some aspect of the application forces deallocation
+ * synchronization.
+ */
+ SPRN(lazy_free, (uint32_t)(uintptr_t)(_pthread_self()));
+
+ /*
+ * Seed the PRNG used for arena load balancing. We can get away with
+ * using the same seed here as for the lazy_free PRNG without
+ * introducing autocorrelation because the PRNG parameters are
+ * distinct.
+ */
+ SPRN(balance, (uint32_t)(uintptr_t)(_pthread_self()));
+
+ if (narenas > 1) {
+ unsigned ind;
+
+ ind = PRN(balance, narenas_2pow);
+ if ((ret = arenas[ind]) == NULL) {
+ malloc_spin_lock(&arenas_lock);
+ if ((ret = arenas[ind]) == NULL)
+ ret = arenas_extend(ind);
+ malloc_spin_unlock(&arenas_lock);
+ }
+ } else
ret = arenas[0];
- }
- next_arena = (next_arena + 1) % narenas;
- malloc_mutex_unlock(&arenas_mtx);
+
arenas_map = ret;
return (ret);
@@ -1588,7 +1854,7 @@
+ (bin->reg_size * regind));
/* Clear bit. */
- mask ^= (1 << bit);
+ mask ^= (1U << bit);
run->regs_mask[i] = mask;
return (ret);
@@ -1605,7 +1871,7 @@
+ (bin->reg_size * regind));
/* Clear bit. */
- mask ^= (1 << bit);
+ mask ^= (1U << bit);
run->regs_mask[i] = mask;
/*
@@ -1635,8 +1901,8 @@
*
* (X * size_invs[(D >> QUANTUM_2POW_MIN) - 3]) >> SIZE_INV_SHIFT
*/
-#define SIZE_INV_SHIFT 21
-#define SIZE_INV(s) (((1 << SIZE_INV_SHIFT) / (s << QUANTUM_2POW_MIN)) + 1)
+#define SIZE_INV_SHIFT 21
+#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s << QUANTUM_2POW_MIN)) + 1)
static const unsigned size_invs[] = {
SIZE_INV(3),
SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7),
@@ -1718,32 +1984,73 @@
if (elm < run->regs_minelm)
run->regs_minelm = elm;
bit = regind - (elm << (SIZEOF_INT_2POW + 3));
- assert((run->regs_mask[elm] & (1 << bit)) == 0);
- run->regs_mask[elm] |= (1 << bit);
+ assert((run->regs_mask[elm] & (1U << bit)) == 0);
+ run->regs_mask[elm] |= (1U << bit);
#undef SIZE_INV
#undef SIZE_INV_SHIFT
}
static void
-arena_run_split(arena_t *arena, arena_run_t *run, size_t size)
+arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool zero)
{
arena_chunk_t *chunk;
unsigned run_ind, map_offset, total_pages, need_pages, rem_pages;
unsigned i;
+ uint32_t pos_beg, pos_end;
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run);
run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk)
>> pagesize_2pow);
total_pages = chunk->map[run_ind].npages;
need_pages = (size >> pagesize_2pow);
+ assert(need_pages > 0);
assert(need_pages <= total_pages);
rem_pages = total_pages - need_pages;
/* Split enough pages from the front of run to fit allocation size. */
map_offset = run_ind;
- for (i = 0; i < need_pages; i++) {
+ pos_beg = chunk->map[map_offset].pos;
+ pos_end = chunk->map[map_offset + total_pages - 1].pos;
+ if (zero == false) {
+ for (i = 0; i < need_pages; i++) {
+ chunk->map[map_offset + i].npages = need_pages;
+ chunk->map[map_offset + i].pos = i;
+ }
+ } else {
+ /*
+ * Handle first page specially, since we need to look for
+ * POS_EMPTY rather than NPAGES_EMPTY.
+ */
+ i = 0;
+ if (chunk->map[map_offset + i].pos != POS_EMPTY) {
+ memset((void *)((uintptr_t)chunk + ((map_offset + i) <<
+ pagesize_2pow)), 0, pagesize);
+ }
chunk->map[map_offset + i].npages = need_pages;
chunk->map[map_offset + i].pos = i;
+
+ /* Handle central pages. */
+ for (i++; i < need_pages - 1; i++) {
+ if (chunk->map[map_offset + i].npages != NPAGES_EMPTY) {
+ memset((void *)((uintptr_t)chunk + ((map_offset
+ + i) << pagesize_2pow)), 0, pagesize);
+ }
+ chunk->map[map_offset + i].npages = need_pages;
+ chunk->map[map_offset + i].pos = i;
+ }
+
+ /*
+ * Handle last page specially, since we need to look for
+ * POS_EMPTY rather than NPAGES_EMPTY.
+ */
+ if (i < need_pages) {
+ if (chunk->map[map_offset + i].npages != POS_EMPTY) {
+ memset((void *)((uintptr_t)chunk + ((map_offset
+ + i) << pagesize_2pow)), 0, pagesize);
+ }
+ chunk->map[map_offset + i].npages = need_pages;
+ chunk->map[map_offset + i].pos = i;
+ }
}
/* Keep track of trailing unused pages for later use. */
@@ -1751,9 +2058,9 @@
/* Update map for trailing pages. */
map_offset += need_pages;
chunk->map[map_offset].npages = rem_pages;
- chunk->map[map_offset].pos = POS_FREE;
+ chunk->map[map_offset].pos = pos_beg;
chunk->map[map_offset + rem_pages - 1].npages = rem_pages;
- chunk->map[map_offset + rem_pages - 1].pos = POS_FREE;
+ chunk->map[map_offset + rem_pages - 1].pos = pos_end;
}
chunk->pages_used += need_pages;
@@ -1770,6 +2077,8 @@
RB_INSERT(arena_chunk_tree_s, &arena->chunks, chunk);
} else {
+ unsigned i;
+
chunk = (arena_chunk_t *)chunk_alloc(chunksize);
if (chunk == NULL)
return (NULL);
@@ -1794,12 +2103,20 @@
/*
* Initialize enough of the map to support one maximal free run.
*/
- chunk->map[arena_chunk_header_npages].npages = chunk_npages -
- arena_chunk_header_npages;
- chunk->map[arena_chunk_header_npages].pos = POS_FREE;
- chunk->map[chunk_npages - 1].npages = chunk_npages -
- arena_chunk_header_npages;
- chunk->map[chunk_npages - 1].pos = POS_FREE;
+ i = arena_chunk_header_npages;
+ chunk->map[i].npages = chunk_npages - arena_chunk_header_npages;
+ chunk->map[i].pos = POS_EMPTY;
+
+ /* Mark the free run's central pages as untouched. */
+ for (i++; i < chunk_npages - 1; i++)
+ chunk->map[i].npages = NPAGES_EMPTY;
+
+ /* Take care when (chunk_npages == 2). */
+ if (i < chunk_npages) {
+ chunk->map[i].npages = chunk_npages -
+ arena_chunk_header_npages;
+ chunk->map[i].pos = POS_EMPTY;
+ }
}
return (chunk);
@@ -1833,7 +2150,7 @@
}
static arena_run_t *
-arena_run_alloc(arena_t *arena, size_t size)
+arena_run_alloc(arena_t *arena, size_t size, bool zero)
{
arena_chunk_t *chunk;
arena_run_t *run;
@@ -1867,14 +2184,14 @@
arena_chunk_header_npages);
for (i = chunk->min_frun_ind; i < chunk_npages;) {
mapelm = &chunk->map[i];
- if (mapelm->pos == POS_FREE) {
+ if (mapelm->pos >= POS_EMPTY) {
if (mapelm->npages >= need_npages) {
run = (arena_run_t *)
((uintptr_t)chunk + (i <<
pagesize_2pow));
/* Update page map. */
arena_run_split(arena, run,
- size);
+ size, zero);
return (run);
}
if (mapelm->npages >
@@ -1908,7 +2225,7 @@
run = (arena_run_t *)((uintptr_t)chunk + (arena_chunk_header_npages <<
>>> TRUNCATED FOR MAIL (1000 lines) <<<