PERFORCE change 129713 for review
Peter Wemm
peter at FreeBSD.org
Wed Nov 28 11:09:09 PST 2007
http://perforce.freebsd.org/chv.cgi?CH=129713
Change 129713 by peter at peter_overcee on 2007/11/28 19:08:51
IFC @129711
Affected files ...
.. //depot/projects/hammer/ObsoleteFiles.inc#41 integrate
.. //depot/projects/hammer/UPDATING#111 integrate
.. //depot/projects/hammer/etc/defaults/periodic.conf#21 integrate
.. //depot/projects/hammer/etc/gss/mech#2 integrate
.. //depot/projects/hammer/gnu/usr.bin/groff/tmac/mdoc.local#29 integrate
.. //depot/projects/hammer/lib/libc/gen/Symbol.map#6 integrate
.. //depot/projects/hammer/lib/libc/stdlib/malloc.3#16 integrate
.. //depot/projects/hammer/lib/libc/stdlib/malloc.c#37 integrate
.. //depot/projects/hammer/lib/libgssapi/gss_acquire_cred.c#2 integrate
.. //depot/projects/hammer/lib/libkse/kse.map#2 integrate
.. //depot/projects/hammer/lib/libkse/sys/lock.c#2 integrate
.. //depot/projects/hammer/lib/libkse/sys/lock.h#2 integrate
.. //depot/projects/hammer/lib/libkse/thread/thr_cond.c#2 integrate
.. //depot/projects/hammer/lib/libkse/thread/thr_init.c#2 integrate
.. //depot/projects/hammer/lib/libkse/thread/thr_kern.c#2 integrate
.. //depot/projects/hammer/lib/libkse/thread/thr_mutex.c#3 integrate
.. //depot/projects/hammer/lib/libkse/thread/thr_rtld.c#2 integrate
.. //depot/projects/hammer/lib/libthr/pthread.map#14 integrate
.. //depot/projects/hammer/lib/libthr/thread/thr_mutex.c#36 integrate
.. //depot/projects/hammer/release/doc/en_US.ISO8859-1/relnotes/article.sgml#26 integrate
.. //depot/projects/hammer/release/doc/share/sgml/release.ent#22 integrate
.. //depot/projects/hammer/sbin/mdconfig/mdconfig.8#21 integrate
.. //depot/projects/hammer/sbin/mount/mount.8#24 integrate
.. //depot/projects/hammer/sbin/newfs/newfs.8#13 integrate
.. //depot/projects/hammer/sbin/newfs/newfs.c#16 integrate
.. //depot/projects/hammer/sbin/newfs/newfs.h#9 integrate
.. //depot/projects/hammer/sbin/sysctl/sysctl.8#15 integrate
.. //depot/projects/hammer/share/man/man4/agp.4#8 integrate
.. //depot/projects/hammer/share/man/man4/ichsmb.4#4 integrate
.. //depot/projects/hammer/share/man/man4/nfe.4#6 integrate
.. //depot/projects/hammer/share/man/man4/rl.4#18 integrate
.. //depot/projects/hammer/share/man/man9/Makefile#75 integrate
.. //depot/projects/hammer/share/man/man9/stack.9#1 branch
.. //depot/projects/hammer/share/misc/iso3166#8 integrate
.. //depot/projects/hammer/share/mk/sys.mk#27 integrate
.. //depot/projects/hammer/sys/amd64/amd64/busdma_machdep.c#46 integrate
.. //depot/projects/hammer/sys/amd64/conf/GENERIC#100 integrate
.. //depot/projects/hammer/sys/arm/arm/busdma_machdep.c#26 integrate
.. //depot/projects/hammer/sys/arm/include/atomic.h#18 integrate
.. //depot/projects/hammer/sys/conf/NOTES#129 integrate
.. //depot/projects/hammer/sys/conf/options#116 integrate
.. //depot/projects/hammer/sys/dev/an/if_an.c#38 integrate
.. //depot/projects/hammer/sys/dev/sound/pci/hda/hdac.c#12 integrate
.. //depot/projects/hammer/sys/dev/wpi/if_wpi.c#3 integrate
.. //depot/projects/hammer/sys/dev/wpi/if_wpireg.h#2 integrate
.. //depot/projects/hammer/sys/i386/conf/GENERIC#58 integrate
.. //depot/projects/hammer/sys/i386/conf/XBOX#6 integrate
.. //depot/projects/hammer/sys/i386/i386/busdma_machdep.c#35 integrate
.. //depot/projects/hammer/sys/ia64/ia64/busdma_machdep.c#22 integrate
.. //depot/projects/hammer/sys/ia64/include/atomic.h#6 integrate
.. //depot/projects/hammer/sys/kern/kern_mutex.c#51 integrate
.. //depot/projects/hammer/sys/kern/kern_rwlock.c#14 integrate
.. //depot/projects/hammer/sys/netinet/tcp_output.c#47 integrate
.. //depot/projects/hammer/sys/netinet/tcp_subr.c#79 integrate
.. //depot/projects/hammer/sys/powerpc/include/atomic.h#11 integrate
.. //depot/projects/hammer/sys/sparc64/conf/GENERIC#55 integrate
.. //depot/projects/hammer/sys/sun4v/conf/GENERIC#6 integrate
.. //depot/projects/hammer/sys/sys/signal.h#17 integrate
.. //depot/projects/hammer/usr.sbin/newsyslog/newsyslog.conf.5#6 integrate
Differences ...
==== //depot/projects/hammer/ObsoleteFiles.inc#41 (text+ko) ====
@@ -1,5 +1,5 @@
#
-# $FreeBSD: src/ObsoleteFiles.inc,v 1.120 2007/11/21 10:49:33 ru Exp $
+# $FreeBSD: src/ObsoleteFiles.inc,v 1.121 2007/11/27 13:58:25 brix Exp $
#
# This file lists old files (OLD_FILES), libraries (OLD_LIBS) and
# directories (OLD_DIRS) which should get removed at an update. Recently
@@ -3964,9 +3964,10 @@
# - usr/share/tmac/mm/se_locale
# - var/yp/Makefile
-# 20071119: shared library version bump
+# 20071120: shared library version bump
OLD_LIBS+=usr/lib/libasn1.so.8
OLD_LIBS+=usr/lib/libgssapi.so.8
+OLD_LIBS+=usr/lib/libgssapi_krb5.so.8
OLD_LIBS+=usr/lib/libhdb.so.8
OLD_LIBS+=usr/lib/libkadm5clnt.so.8
OLD_LIBS+=usr/lib/libkadm5srv.so.8
==== //depot/projects/hammer/UPDATING#111 (text+ko) ====
@@ -21,6 +21,10 @@
developers choose to disable these features on build machines
to maximize performance.
+20071128:
+ The ADAPTIVE_GIANT kernel option has been retired because its
+ functionality is the default now.
+
20071118:
The AT keyboard emulation of sunkbd(4) has been turned on
by default. In order to make the special symbols of the Sun
@@ -945,4 +949,4 @@
Contact Warner Losh if you have any questions about your use of
this document.
-$FreeBSD: src/UPDATING,v 1.512 2007/11/18 18:11:16 marius Exp $
+$FreeBSD: src/UPDATING,v 1.513 2007/11/28 13:04:11 matteo Exp $
==== //depot/projects/hammer/etc/defaults/periodic.conf#21 (text+ko) ====
@@ -13,7 +13,7 @@
# For a more detailed explanation of all the periodic.conf variables, please
# refer to the periodic.conf(5) manual page.
#
-# $FreeBSD: src/etc/defaults/periodic.conf,v 1.44 2007/05/29 06:22:13 dougb Exp $
+# $FreeBSD: src/etc/defaults/periodic.conf,v 1.45 2007/11/28 17:31:11 jhb Exp $
#
# What files override these defaults ?
@@ -45,7 +45,9 @@
daily_clean_tmps_enable="NO" # Delete stuff daily
daily_clean_tmps_dirs="/tmp" # Delete under here
daily_clean_tmps_days="3" # If not accessed for
-daily_clean_tmps_ignore=".X*-lock quota.user quota.group" # Don't delete these
+daily_clean_tmps_ignore=".X*-lock .X11-unix .ICE-unix .font-unix .XIM-unix"
+daily_clean_tmps_ignore="$daily_clean_tmps_ignore quota.user quota.group"
+ # Don't delete these
daily_clean_tmps_verbose="YES" # Mention files deleted
# 120.clean-preserve
==== //depot/projects/hammer/etc/gss/mech#2 (text+ko) ====
@@ -1,4 +1,4 @@
-# $FreeBSD: src/etc/gss/mech,v 1.1 2005/12/29 14:40:18 dfr Exp $
+# $FreeBSD: src/etc/gss/mech,v 1.2 2007/11/27 21:47:56 jhb Exp $
#
# Name OID Library name Kernel module
-kerberosv5 1.2.840.113554.1.2.2 /usr/lib/libgssapi_krb5.so.8 -
+kerberosv5 1.2.840.113554.1.2.2 /usr/lib/libgssapi_krb5.so.9 -
==== //depot/projects/hammer/gnu/usr.bin/groff/tmac/mdoc.local#29 (text+ko) ====
@@ -22,7 +22,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.\" $FreeBSD: src/gnu/usr.bin/groff/tmac/mdoc.local,v 1.61 2007/10/22 10:01:58 ru Exp $
+.\" $FreeBSD: src/gnu/usr.bin/groff/tmac/mdoc.local,v 1.62 2007/11/27 10:00:33 jkoshy Exp $
.\"
.\" %beginstrip%
.
@@ -52,7 +52,7 @@
.ds doc-str-Lb-libmd Message Digest (MD4, MD5, etc.) Support Library (libmd, \-lmd)
.ds doc-str-Lb-libmemstat Kernel Memory Allocator Statistics Library (libmemstat, \-lmemstat)
.ds doc-str-Lb-libnetgraph Netgraph User Library (libnetgraph, \-lnetgraph)
-.ds doc-str-Lb-libpmc Performance Monitoring Counters API (libpmc, \-lpmc)
+.ds doc-str-Lb-libpmc Performance Monitoring Counters Interface Library (libpmc, \-lpmc)
.ds doc-str-Lb-librpcsvc RPC Service Library (librpcsvc, \-lrpcsvc)
.ds doc-str-Lb-libsdp Bluetooth Service Discovery Protocol User Library (libsdp, \-lsdp)
.ds doc-str-Lb-libthr 1:1 Threading Library (libthr, \-lthr)
==== //depot/projects/hammer/lib/libc/gen/Symbol.map#6 (text) ====
@@ -1,5 +1,5 @@
/*
- * $FreeBSD: src/lib/libc/gen/Symbol.map,v 1.6 2007/05/31 13:01:33 deischen Exp $
+ * $FreeBSD: src/lib/libc/gen/Symbol.map,v 1.7 2007/11/27 16:22:21 jasone Exp $
*/
FBSD_1.0 {
@@ -378,6 +378,7 @@
_pthread_kill;
_pthread_main_np;
_pthread_mutex_destroy;
+ _pthread_mutex_init_calloc_cb;
_pthread_mutex_init;
_pthread_mutex_lock;
_pthread_mutex_trylock;
==== //depot/projects/hammer/lib/libc/stdlib/malloc.3#16 (text+ko) ====
@@ -30,9 +30,9 @@
.\" SUCH DAMAGE.
.\"
.\" @(#)malloc.3 8.1 (Berkeley) 6/4/93
-.\" $FreeBSD: src/lib/libc/stdlib/malloc.3,v 1.73 2007/06/15 22:32:33 jasone Exp $
+.\" $FreeBSD: src/lib/libc/stdlib/malloc.3,v 1.74 2007/11/27 03:18:26 jasone Exp $
.\"
-.Dd June 15, 2007
+.Dd October 1, 2007
.Dt MALLOC 3
.Os
.Sh NAME
@@ -177,6 +177,19 @@
The process will call
.Xr abort 3
in these cases.
+.It B
+Increase/decrease the per-arena lock contention threshold at which a thread is
+randomly re-assigned to an arena.
+This dynamic load balancing tends to push threads away from highly contended
+arenas, which avoids worst case contention scenarios in which threads
+disproportionately utilize arenas.
+However, due to the highly dynamic load that applications may place on the
+allocator, it is impossible for the allocator to know in advance how sensitive
+it should be to contention over arenas.
+Therefore, some applications may benefit from increasing or decreasing this
+threshold parameter.
+This option is not available for some configurations (non-PIC).
+This option can be specified multiple times.
.It H
Use
.Xr madvise 2
@@ -204,6 +217,18 @@
Increase/decrease the virtual memory chunk size by a factor of two.
The default chunk size is 1 MB.
This option can be specified multiple times.
+.It L
+Increase/decrease the per-arena number of slots for lazy deallocation.
+Lazy deallocation can decrease lock contention, especially for programs that use
+the producer/consumer model.
+The default is 256 slots per arena (so
+.Ev MALLOC_OPTIONS=lllllllll
+will disable lazy deallocation), but note that due to algorithmic details, the
+cache is typically flushed well before it completely fills.
+This option has no impact unless there are multiple CPUs, and lazy deallocation
+does not activate unless the program uses multiple threads.
+This option is not available for some configurations (non-PIC).
+This option can be specified multiple times.
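[Aside, not part of this changelist: for concreteness, the two new option letters documented above could be exercised from a small multi-threaded test program such as the following sketch. The program name, thread count, and allocation sizes are arbitrary; per the flag convention described earlier in malloc(3), uppercase letters raise and lowercase letters lower the corresponding setting.]

/*
 * alloc_stress.c -- hypothetical example, not part of this changelist.
 * Run as, e.g.:
 *
 *	env MALLOC_OPTIONS=BBll ./alloc_stress
 *
 * "BB" raises the per-arena contention threshold twice and "ll" shrinks the
 * lazy-free cache twice.  Arena balancing and lazy deallocation only come
 * into play with multiple threads (and, for lazy deallocation, multiple
 * CPUs), so several workers are spawned.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define	NTHREADS	8
#define	NITERS		100000

static void *
worker(void *arg)
{
	int i;

	(void)arg;
	for (i = 0; i < NITERS; i++) {
		void *p = malloc(64 + (i % 512));

		if (p == NULL)
			abort();
		free(p);
	}
	return (NULL);
}

int
main(void)
{
	pthread_t tid[NTHREADS];
	int i;

	for (i = 0; i < NTHREADS; i++)
		if (pthread_create(&tid[i], NULL, worker, NULL) != 0)
			abort();
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
	printf("done\n");
	return (0);
}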
.It N
Increase/decrease the number of arenas by a factor of two.
The default number of arenas is four times the number of CPUs, or one if there
==== //depot/projects/hammer/lib/libc/stdlib/malloc.c#37 (text+ko) ====
@@ -98,14 +98,14 @@
* defaults the A and J runtime options to off. These settings are appropriate
* for production systems.
*/
-/* #define MALLOC_PRODUCTION */
+/* #define MALLOC_PRODUCTION */
#ifndef MALLOC_PRODUCTION
# define MALLOC_DEBUG
#endif
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.147 2007/06/15 22:00:16 jasone Exp $");
+__FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.152 2007/11/28 00:17:34 jasone Exp $");
#include "libc_private.h"
#ifdef MALLOC_DEBUG
@@ -171,6 +171,7 @@
# define QUANTUM_2POW_MIN 4
# define SIZEOF_PTR_2POW 2
# define USE_BRK
+# define CPU_SPINWAIT __asm__ volatile("pause")
#endif
#ifdef __ia64__
# define QUANTUM_2POW_MIN 4
@@ -189,6 +190,7 @@
#ifdef __amd64__
# define QUANTUM_2POW_MIN 4
# define SIZEOF_PTR_2POW 3
+# define CPU_SPINWAIT __asm__ volatile("pause")
#endif
#ifdef __arm__
# define QUANTUM_2POW_MIN 3
@@ -202,9 +204,9 @@
# define USE_BRK
#endif
-#define SIZEOF_PTR (1 << SIZEOF_PTR_2POW)
+#define SIZEOF_PTR (1U << SIZEOF_PTR_2POW)
-/* sizeof(int) == (1 << SIZEOF_INT_2POW). */
+/* sizeof(int) == (1U << SIZEOF_INT_2POW). */
#ifndef SIZEOF_INT_2POW
# define SIZEOF_INT_2POW 2
#endif
@@ -226,7 +228,7 @@
* negatively affect performance.
*/
#define CACHELINE_2POW 6
-#define CACHELINE ((size_t)(1 << CACHELINE_2POW))
+#define CACHELINE ((size_t)(1U << CACHELINE_2POW))
/* Smallest size class to support. */
#define TINY_MIN_2POW 1
@@ -237,7 +239,7 @@
* power of 2.
*/
#define SMALL_MAX_2POW_DEFAULT 9
-#define SMALL_MAX_DEFAULT (1 << SMALL_MAX_2POW_DEFAULT)
+#define SMALL_MAX_DEFAULT (1U << SMALL_MAX_2POW_DEFAULT)
/*
* Maximum desired run header overhead. Runs are sized as small as possible
@@ -252,18 +254,69 @@
* RUN_MAX_OVRHD_RELAX specifies the maximum number of bits per region of
* overhead for which RUN_MAX_OVRHD is relaxed.
*/
-#define RUN_MAX_OVRHD 0.015
-#define RUN_MAX_OVRHD_RELAX 1.5
+#define RUN_MAX_OVRHD 0.015
+#define RUN_MAX_OVRHD_RELAX 1.5
/* Put a cap on small object run size. This overrides RUN_MAX_OVRHD. */
-#define RUN_MAX_SMALL_2POW 15
-#define RUN_MAX_SMALL (1 << RUN_MAX_SMALL_2POW)
+#define RUN_MAX_SMALL_2POW 15
+#define RUN_MAX_SMALL (1U << RUN_MAX_SMALL_2POW)
+
+/* Default size of each arena's lazy free cache. */
+#define LAZY_FREE_2POW_DEFAULT 8
+/*
+ * Number of pseudo-random probes to conduct before considering the cache to be
+ * overly full. It takes on average n probes to detect fullness of (n-1)/n.
+ * However, we are effectively doing multiple non-independent trials (each
+ * deallocation is a trial), so the actual average threshold for clearing the
+ * cache is somewhat lower.
+ */
+#define LAZY_FREE_NPROBES 5
+
+/*
+ * Hyper-threaded CPUs may need a special instruction inside spin loops in
+ * order to yield to another virtual CPU. If no such instruction is defined
+ * above, make CPU_SPINWAIT a no-op.
+ */
+#ifndef CPU_SPINWAIT
+# define CPU_SPINWAIT
+#endif
+
+/*
+ * Adaptive spinning must eventually switch to blocking, in order to avoid the
+ * potential for priority inversion deadlock. Backing off past a certain point
+ * can actually waste time.
+ */
+#define SPIN_LIMIT_2POW 11
+
+/*
+ * Conversion from spinning to blocking is expensive; we use (1U <<
+ * BLOCK_COST_2POW) to estimate how many more times costly blocking is than
+ * worst-case spinning.
+ */
+#define BLOCK_COST_2POW 4
+
+/*
+ * We use an exponential moving average to track recent lock contention, where
+ * the size of the history window is N, and alpha=2/(N+1).
+ *
+ * Due to integer math rounding, very small values here can cause substantial
+ * degradation in accuracy, thus making the moving average decay faster than it
+ * would with precise calculation.
+ */
+#define BALANCE_ALPHA_INV_2POW 9
+
+/*
+ * Threshold value for the exponential moving contention average at which to
+ * re-assign a thread.
+ */
+#define BALANCE_THRESHOLD_DEFAULT (1U << (SPIN_LIMIT_2POW-4))
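[Aside, not part of this changelist: to make the fixed-point arithmetic implied by BALANCE_ALPHA_INV_2POW concrete, here is a minimal sketch. The helper name and exact rounding are assumptions; the real update lives in the portion of malloc.c elided by the mail truncation below. Reading the constant as log2(1/alpha), each new contention sample gets a 1/512 weight.]

#include <stdint.h>
#include <stdio.h>

#define	BALANCE_ALPHA_INV_2POW	9	/* Mirrors the define above. */

/* Illustrative exponential moving average update in integer math. */
static uint32_t
contention_ema(uint32_t ema, uint32_t sample)
{

	if (sample >= ema)
		ema += (sample - ema) >> BALANCE_ALPHA_INV_2POW;
	else
		ema -= (ema - sample) >> BALANCE_ALPHA_INV_2POW;
	return (ema);
}

int
main(void)
{
	uint32_t ema = 0;
	int i;

	/*
	 * Feed a constant spin cost; the average stalls somewhat below the
	 * input because of integer rounding, the degradation the comment
	 * above warns about.
	 */
	for (i = 0; i < 10000; i++)
		ema = contention_ema(ema, 2048);
	printf("%u\n", ema);
	return (0);
}

[Once the averaged contention exceeds opt_balance_threshold, the owning thread is re-assigned to a randomly chosen arena; compare the nbalance statistic above and PRN(balance, narenas_2pow) further down in this diff.]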
/******************************************************************************/
/*
- * Mutexes based on spinlocks. We can't use normal pthread mutexes, because
- * they require malloc()ed memory.
+ * Mutexes based on spinlocks. We can't use normal pthread spinlocks in all
+ * places, because they require malloc()ed memory, which causes bootstrapping
+ * issues in some cases.
*/
typedef struct {
spinlock_t lock;
@@ -319,6 +372,11 @@
size_t allocated_large;
uint64_t nmalloc_large;
uint64_t ndalloc_large;
+
+#ifndef NO_TLS
+ /* Number of times this arena reassigned a thread due to contention. */
+ uint64_t nbalance;
+#endif
};
typedef struct chunk_stats_s chunk_stats_t;
@@ -374,17 +432,27 @@
typedef struct arena_chunk_map_s arena_chunk_map_t;
struct arena_chunk_map_s {
- /* Number of pages in run. */
+ /*
+ * Number of pages in run. For a free run that has never been touched,
+ * this is NPAGES_EMPTY for the central pages, which allows us to avoid
+ * zero-filling untouched pages for calloc().
+ */
+#define NPAGES_EMPTY ((uint32_t)0x0U)
uint32_t npages;
/*
- * Position within run. For a free run, this is POS_FREE for the first
- * and last pages. The POS_FREE special value makes it possible to
- * quickly coalesce free runs.
+ * Position within run. For a free run, this is POS_EMPTY/POS_FREE for
+ * the first and last pages. The special values make it possible to
+ * quickly coalesce free runs. POS_EMPTY indicates that the run has
+ * never been touched, which allows us to avoid zero-filling untouched
+ * pages for calloc().
*
* This is the limiting factor for chunksize; there can be at most 2^31
* pages in a run.
+ *
+ * POS_EMPTY is assumed by arena_run_dalloc() to be less than POS_FREE.
*/
-#define POS_FREE ((uint32_t)0xffffffffU)
+#define POS_EMPTY ((uint32_t)0xfffffffeU)
+#define POS_FREE ((uint32_t)0xffffffffU)
uint32_t pos;
};
@@ -496,8 +564,8 @@
# define ARENA_MAGIC 0x947d3d24
#endif
- /* All operations on this arena require that mtx be locked. */
- malloc_mutex_t mtx;
+ /* All operations on this arena require that lock be locked. */
+ pthread_mutex_t lock;
#ifdef MALLOC_STATS
arena_stats_t stats;
@@ -517,7 +585,23 @@
* order to avoid interactions between multiple threads that could make
* a single spare inadequate.
*/
- arena_chunk_t *spare;
+ arena_chunk_t *spare;
+
+#ifndef NO_TLS
+ /*
+ * The arena load balancing machinery needs to keep track of how much
+ * lock contention there is. This value is exponentially averaged.
+ */
+ uint32_t contention;
+
+ /*
+ * Deallocation of small objects can be lazy, in which case free_cache
+ * stores pointers to those objects that have not yet been deallocated.
+ * In order to avoid lock contention, slots are chosen randomly. Empty
+ * slots contain NULL.
+ */
+ void **free_cache;
+#endif
/*
* bins is used to store rings of free regions of the following sizes,
@@ -650,9 +734,9 @@
static arena_t **arenas;
static unsigned narenas;
#ifndef NO_TLS
-static unsigned next_arena;
+static unsigned narenas_2pow;
#endif
-static malloc_mutex_t arenas_mtx; /* Protects arenas initialization. */
+static pthread_mutex_t arenas_lock; /* Protects arenas initialization. */
#ifndef NO_TLS
/*
@@ -681,6 +765,10 @@
static bool opt_junk = false;
#endif
static bool opt_hint = false;
+#ifndef NO_TLS
+static int opt_lazy_free_2pow = LAZY_FREE_2POW_DEFAULT;
+static uint64_t opt_balance_threshold = BALANCE_THRESHOLD_DEFAULT;
+#endif
static bool opt_print_stats = false;
static size_t opt_quantum_2pow = QUANTUM_2POW_MIN;
static size_t opt_small_max_2pow = SMALL_MAX_2POW_DEFAULT;
@@ -689,7 +777,7 @@
static bool opt_sysv = false;
static bool opt_xmalloc = false;
static bool opt_zero = false;
-static int32_t opt_narenas_lshift = 0;
+static int opt_narenas_lshift = 0;
typedef struct {
void *p;
@@ -709,6 +797,7 @@
*/
static void malloc_mutex_init(malloc_mutex_t *a_mutex);
+static bool malloc_spin_init(pthread_mutex_t *lock);
static void wrtmessage(const char *p1, const char *p2, const char *p3,
const char *p4);
#ifdef MALLOC_STATS
@@ -717,6 +806,7 @@
static char *umax2s(uintmax_t x, char *s);
static bool base_pages_alloc(size_t minsize);
static void *base_alloc(size_t size);
+static void *base_calloc(size_t number, size_t size);
static chunk_node_t *base_chunk_node_alloc(void);
static void base_chunk_node_dealloc(chunk_node_t *node);
#ifdef MALLOC_STATS
@@ -729,15 +819,16 @@
#ifndef NO_TLS
static arena_t *choose_arena_hard(void);
#endif
-static void arena_run_split(arena_t *arena, arena_run_t *run, size_t size);
+static void arena_run_split(arena_t *arena, arena_run_t *run, size_t size,
+ bool zero);
static arena_chunk_t *arena_chunk_alloc(arena_t *arena);
static void arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk);
-static arena_run_t *arena_run_alloc(arena_t *arena, size_t size);
+static arena_run_t *arena_run_alloc(arena_t *arena, size_t size, bool zero);
static void arena_run_dalloc(arena_t *arena, arena_run_t *run, size_t size);
static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin);
static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin);
static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size);
-static void *arena_malloc(arena_t *arena, size_t size);
+static void *arena_malloc(arena_t *arena, size_t size, bool zero);
static void *arena_palloc(arena_t *arena, size_t alignment, size_t size,
size_t alloc_size);
static size_t arena_salloc(const void *ptr);
@@ -745,7 +836,7 @@
static void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr);
static bool arena_new(arena_t *arena);
static arena_t *arenas_extend(unsigned ind);
-static void *huge_malloc(size_t size);
+static void *huge_malloc(size_t size, bool zero);
static void *huge_palloc(size_t alignment, size_t size);
static void *huge_ralloc(void *ptr, size_t size, size_t oldsize);
static void huge_dalloc(void *ptr);
@@ -763,7 +854,9 @@
*/
/******************************************************************************/
/*
- * Begin mutex.
+ * Begin mutex. We can't use normal pthread mutexes in all places, because
+ * they require malloc()ed memory, which causes bootstrapping issues in some
+ * cases.
*/
static void
@@ -795,6 +888,86 @@
*/
/******************************************************************************/
/*
+ * Begin spin lock. Spin locks here are actually adaptive mutexes that block
+ * after a period of spinning, because unbounded spinning would allow for
+ * priority inversion.
+ */
+
+/*
+ * We use an unpublished interface to initialize pthread mutexes with an
+ * allocation callback, in order to avoid infinite recursion.
+ */
+int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex,
+ void *(calloc_cb)(size_t, size_t));
+
+__weak_reference(_pthread_mutex_init_calloc_cb_stub,
+ _pthread_mutex_init_calloc_cb);
+
+int
+_pthread_mutex_init_calloc_cb_stub(pthread_mutex_t *mutex,
+ void *(calloc_cb)(size_t, size_t))
+{
+
+ return (0);
+}
+
+static bool
+malloc_spin_init(pthread_mutex_t *lock)
+{
+
+ if (_pthread_mutex_init_calloc_cb(lock, base_calloc) != 0)
+ return (true);
+
+ return (false);
+}
+
+static inline unsigned
+malloc_spin_lock(pthread_mutex_t *lock)
+{
+ unsigned ret = 0;
+
+ if (__isthreaded) {
+ if (_pthread_mutex_trylock(lock) != 0) {
+ unsigned i;
+ volatile unsigned j;
+
+ /* Exponentially back off. */
+ for (i = 1; i <= SPIN_LIMIT_2POW; i++) {
+ for (j = 0; j < (1U << i); j++)
+ ret++;
+
+ CPU_SPINWAIT;
+ if (_pthread_mutex_trylock(lock) == 0)
+ return (ret);
+ }
+
+ /*
+ * Spinning failed. Block until the lock becomes
+ * available, in order to avoid indefinite priority
+ * inversion.
+ */
+ _pthread_mutex_lock(lock);
+ assert((ret << BLOCK_COST_2POW) != 0);
+ return (ret << BLOCK_COST_2POW);
+ }
+ }
+
+ return (ret);
+}
+
+static inline void
+malloc_spin_unlock(pthread_mutex_t *lock)
+{
+
+ if (__isthreaded)
+ _pthread_mutex_unlock(lock);
+}
+
+/*
+ * End spin lock.
+ */
+/******************************************************************************/
+/*
* Begin Utility functions/macros.
*/
@@ -840,6 +1013,63 @@
return (x);
}
+#ifndef NO_TLS
+/*
+ * Use a simple linear congruential pseudo-random number generator:
+ *
+ * prn(y) = (a*x + c) % m
+ *
+ * where the following constants ensure maximal period:
+ *
+ * a == Odd number (relatively prime to 2^n), and (a-1) is a multiple of 4.
+ * c == Odd number (relatively prime to 2^n).
+ * m == 2^32
+ *
+ * See Knuth's TAOCP 3rd Ed., Vol. 2, pg. 17 for details on these constraints.
+ *
+ * This choice of m has the disadvantage that the quality of the bits is
+ * proportional to bit position. For example, the lowest bit has a cycle of 2,
+ * the next has a cycle of 4, etc. For this reason, we prefer to use the upper
+ * bits.
+ */
+#define PRN_DEFINE(suffix, var, a, c) \
+static inline void \
+sprn_##suffix(uint32_t seed) \
+{ \
+ var = seed; \
+} \
+ \
+static inline uint32_t \
+prn_##suffix(uint32_t lg_range) \
+{ \
+ uint32_t ret, x; \
+ \
+ assert(lg_range > 0); \
+ assert(lg_range <= 32); \
+ \
+ x = (var * (a)) + (c); \
+ var = x; \
+ ret = x >> (32 - lg_range); \
+ \
+ return (ret); \
+}
+#define SPRN(suffix, seed) sprn_##suffix(seed)
+#define PRN(suffix, lg_range) prn_##suffix(lg_range)
+
+/*
+ * Define PRNGs, one for each purpose, in order to avoid auto-correlation
+ * problems.
+ */
+
+/* Define the per-thread PRNG used for lazy deallocation. */
+static __thread uint32_t lazy_free_x;
+PRN_DEFINE(lazy_free, lazy_free_x, 12345, 12347)
+
+/* Define the PRNG used for arena assignment. */
+static __thread uint32_t balance_x;
+PRN_DEFINE(balance, balance_x, 1297, 1301);
+#endif
+
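[Aside, not part of this changelist: the generator defined by PRN_DEFINE above can be exercised standalone. The sketch below copies the recurrence with the "balance" constants and shows why only the upper bits are returned; all names here are illustrative.]

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t demo_x;

static void
sprn_demo(uint32_t seed)
{

	demo_x = seed;
}

static uint32_t
prn_demo(uint32_t lg_range)
{
	uint32_t x;

	assert(lg_range > 0 && lg_range <= 32);
	x = (demo_x * 1297) + 1301;	/* a*x + c, mod 2^32 via wraparound. */
	demo_x = x;
	return (x >> (32 - lg_range));	/* Keep only the high-quality bits. */
}

int
main(void)
{
	unsigned i;

	sprn_demo(12345);
	/*
	 * With lg_range == 2 every result lies in [0, 4), e.g. an arena
	 * index when narenas_2pow == 2.
	 */
	for (i = 0; i < 8; i++)
		printf("%u\n", prn_demo(2));
	return (0);
}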
static void
wrtmessage(const char *p1, const char *p2, const char *p3, const char *p4)
{
@@ -876,7 +1106,7 @@
* integer printing functionality, so that malloc_printf() use can be limited to
* MALLOC_STATS code.
*/
-#define UMAX2S_BUFSIZE 21
+#define UMAX2S_BUFSIZE 21
static char *
umax2s(uintmax_t x, char *s)
{
@@ -995,6 +1225,17 @@
return (ret);
}
+static void *
+base_calloc(size_t number, size_t size)
+{
+ void *ret;
+
+ ret = base_alloc(number * size);
+ memset(ret, 0, number * size);
+
+ return (ret);
+}
+
static chunk_node_t *
base_chunk_node_alloc(void)
{
@@ -1202,6 +1443,11 @@
&& (uintptr_t)chunk < (uintptr_t)brk_max) {
/* Re-use a previously freed brk chunk. */
ret = chunk;
+ /*
+ * Maintain invariant that all newly allocated
+ * chunks are untouched or zero-filled.
+ */
+ memset(ret, 0, size);
goto RETURN;
}
#endif
@@ -1444,8 +1690,10 @@
}
ret = arenas_map;
- if (ret == NULL)
+ if (ret == NULL) {
ret = choose_arena_hard();
+ assert(ret != NULL);
+ }
#else
if (__isthreaded) {
unsigned long ind;
@@ -1479,12 +1727,12 @@
* Avoid races with another thread that may have already
* initialized arenas[ind].
*/
- malloc_mutex_lock(&arenas_mtx);
+ malloc_spin_lock(&arenas_lock);
if (arenas[ind] == NULL)
ret = arenas_extend((unsigned)ind);
else
ret = arenas[ind];
- malloc_mutex_unlock(&arenas_mtx);
+ malloc_spin_unlock(&arenas_lock);
}
} else
ret = arenas[0];
@@ -1506,21 +1754,39 @@
assert(__isthreaded);
- /* Assign one of the arenas to this thread, in a round-robin fashion. */
- malloc_mutex_lock(&arenas_mtx);
- ret = arenas[next_arena];
- if (ret == NULL)
- ret = arenas_extend(next_arena);
- if (ret == NULL) {
- /*
- * Make sure that this function never returns NULL, so that
- * choose_arena() doesn't have to check for a NULL return
- * value.
- */
+ /*
+ * Seed the PRNG used for lazy deallocation. Since seeding only occurs
+ * on the first allocation by a thread, it is possible for a thread to
+ * deallocate before seeding. This is not a critical issue though,
+	 * since it is extremely unusual for an application to use threads
+ * that deallocate but *never* allocate, and because even if seeding
+ * never occurs for multiple threads, they will tend to drift apart
+ * unless some aspect of the application forces deallocation
+ * synchronization.
+ */
+ SPRN(lazy_free, (uint32_t)(uintptr_t)(_pthread_self()));
+
+ /*
+ * Seed the PRNG used for arena load balancing. We can get away with
+ * using the same seed here as for the lazy_free PRNG without
+ * introducing autocorrelation because the PRNG parameters are
+ * distinct.
+ */
+ SPRN(balance, (uint32_t)(uintptr_t)(_pthread_self()));
+
+ if (narenas > 1) {
+ unsigned ind;
+
+ ind = PRN(balance, narenas_2pow);
+ if ((ret = arenas[ind]) == NULL) {
+ malloc_spin_lock(&arenas_lock);
+ if ((ret = arenas[ind]) == NULL)
+ ret = arenas_extend(ind);
+ malloc_spin_unlock(&arenas_lock);
+ }
+ } else
ret = arenas[0];
- }
- next_arena = (next_arena + 1) % narenas;
- malloc_mutex_unlock(&arenas_mtx);
+
arenas_map = ret;
return (ret);
@@ -1588,7 +1854,7 @@
+ (bin->reg_size * regind));
/* Clear bit. */
- mask ^= (1 << bit);
+ mask ^= (1U << bit);
run->regs_mask[i] = mask;
return (ret);
@@ -1605,7 +1871,7 @@
+ (bin->reg_size * regind));
/* Clear bit. */
- mask ^= (1 << bit);
+ mask ^= (1U << bit);
run->regs_mask[i] = mask;
/*
@@ -1635,8 +1901,8 @@
*
* (X * size_invs[(D >> QUANTUM_2POW_MIN) - 3]) >> SIZE_INV_SHIFT
*/
-#define SIZE_INV_SHIFT 21
-#define SIZE_INV(s) (((1 << SIZE_INV_SHIFT) / (s << QUANTUM_2POW_MIN)) + 1)
+#define SIZE_INV_SHIFT 21
+#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s << QUANTUM_2POW_MIN)) + 1)
static const unsigned size_invs[] = {
SIZE_INV(3),
SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7),
@@ -1718,32 +1984,73 @@
if (elm < run->regs_minelm)
run->regs_minelm = elm;
bit = regind - (elm << (SIZEOF_INT_2POW + 3));
- assert((run->regs_mask[elm] & (1 << bit)) == 0);
- run->regs_mask[elm] |= (1 << bit);
+ assert((run->regs_mask[elm] & (1U << bit)) == 0);
+ run->regs_mask[elm] |= (1U << bit);
#undef SIZE_INV
#undef SIZE_INV_SHIFT
}
static void
-arena_run_split(arena_t *arena, arena_run_t *run, size_t size)
+arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool zero)
{
arena_chunk_t *chunk;
unsigned run_ind, map_offset, total_pages, need_pages, rem_pages;
unsigned i;
+ uint32_t pos_beg, pos_end;
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run);
run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk)
>> pagesize_2pow);
total_pages = chunk->map[run_ind].npages;
need_pages = (size >> pagesize_2pow);
+ assert(need_pages > 0);
assert(need_pages <= total_pages);
rem_pages = total_pages - need_pages;
/* Split enough pages from the front of run to fit allocation size. */
map_offset = run_ind;
- for (i = 0; i < need_pages; i++) {
+ pos_beg = chunk->map[map_offset].pos;
+ pos_end = chunk->map[map_offset + total_pages - 1].pos;
+ if (zero == false) {
+ for (i = 0; i < need_pages; i++) {
+ chunk->map[map_offset + i].npages = need_pages;
+ chunk->map[map_offset + i].pos = i;
+ }
+ } else {
+ /*
+ * Handle first page specially, since we need to look for
+ * POS_EMPTY rather than NPAGES_EMPTY.
+ */
+ i = 0;
+ if (chunk->map[map_offset + i].pos != POS_EMPTY) {
+ memset((void *)((uintptr_t)chunk + ((map_offset + i) <<
+ pagesize_2pow)), 0, pagesize);
+ }
chunk->map[map_offset + i].npages = need_pages;
chunk->map[map_offset + i].pos = i;
+
+ /* Handle central pages. */
+ for (i++; i < need_pages - 1; i++) {
+ if (chunk->map[map_offset + i].npages != NPAGES_EMPTY) {
+ memset((void *)((uintptr_t)chunk + ((map_offset
+ + i) << pagesize_2pow)), 0, pagesize);
+ }
+ chunk->map[map_offset + i].npages = need_pages;
+ chunk->map[map_offset + i].pos = i;
+ }
+
+ /*
+ * Handle last page specially, since we need to look for
+ * POS_EMPTY rather than NPAGES_EMPTY.
+ */
+ if (i < need_pages) {
+ if (chunk->map[map_offset + i].npages != POS_EMPTY) {
+ memset((void *)((uintptr_t)chunk + ((map_offset
+ + i) << pagesize_2pow)), 0, pagesize);
+ }
+ chunk->map[map_offset + i].npages = need_pages;
+ chunk->map[map_offset + i].pos = i;
+ }
}
/* Keep track of trailing unused pages for later use. */
@@ -1751,9 +2058,9 @@
/* Update map for trailing pages. */
map_offset += need_pages;
chunk->map[map_offset].npages = rem_pages;
- chunk->map[map_offset].pos = POS_FREE;
+ chunk->map[map_offset].pos = pos_beg;
chunk->map[map_offset + rem_pages - 1].npages = rem_pages;
- chunk->map[map_offset + rem_pages - 1].pos = POS_FREE;
+ chunk->map[map_offset + rem_pages - 1].pos = pos_end;
}
chunk->pages_used += need_pages;
@@ -1770,6 +2077,8 @@
RB_INSERT(arena_chunk_tree_s, &arena->chunks, chunk);
} else {
+ unsigned i;
+
chunk = (arena_chunk_t *)chunk_alloc(chunksize);
if (chunk == NULL)
return (NULL);
@@ -1794,12 +2103,20 @@
/*
* Initialize enough of the map to support one maximal free run.
*/
- chunk->map[arena_chunk_header_npages].npages = chunk_npages -
- arena_chunk_header_npages;
- chunk->map[arena_chunk_header_npages].pos = POS_FREE;
- chunk->map[chunk_npages - 1].npages = chunk_npages -
- arena_chunk_header_npages;
- chunk->map[chunk_npages - 1].pos = POS_FREE;
+ i = arena_chunk_header_npages;
+ chunk->map[i].npages = chunk_npages - arena_chunk_header_npages;
+ chunk->map[i].pos = POS_EMPTY;
+
+ /* Mark the free run's central pages as untouched. */
+ for (i++; i < chunk_npages - 1; i++)
+ chunk->map[i].npages = NPAGES_EMPTY;
+
+ /* Take care when (chunk_npages == 2). */
+ if (i < chunk_npages) {
+ chunk->map[i].npages = chunk_npages -
+ arena_chunk_header_npages;
+ chunk->map[i].pos = POS_EMPTY;
+ }
}
return (chunk);
@@ -1833,7 +2150,7 @@
}
static arena_run_t *
-arena_run_alloc(arena_t *arena, size_t size)
+arena_run_alloc(arena_t *arena, size_t size, bool zero)
{
arena_chunk_t *chunk;
arena_run_t *run;
@@ -1867,14 +2184,14 @@
arena_chunk_header_npages);
for (i = chunk->min_frun_ind; i < chunk_npages;) {
mapelm = &chunk->map[i];
- if (mapelm->pos == POS_FREE) {
+ if (mapelm->pos >= POS_EMPTY) {
if (mapelm->npages >= need_npages) {
run = (arena_run_t *)
((uintptr_t)chunk + (i <<
pagesize_2pow));
/* Update page map. */
arena_run_split(arena, run,
- size);
+ size, zero);
return (run);
}
if (mapelm->npages >
@@ -1908,7 +2225,7 @@
run = (arena_run_t *)((uintptr_t)chunk + (arena_chunk_header_npages <<
>>> TRUNCATED FOR MAIL (1000 lines) <<<