PERFORCE change 149020 for review

Konrad Jankowski konrad at FreeBSD.org
Mon Sep 1 21:08:04 UTC 2008


http://perforce.freebsd.org/chv.cgi?CH=149020

Change 149020 by konrad at vspredator on 2008/09/01 21:07:06

	ifc

Affected files ...

.. //depot/projects/soc2008/konrad_collation/libc/include/libc_private.h#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/locale/collate.c#9 edit
.. //depot/projects/soc2008/konrad_collation/libc/stdlib/Makefile.inc#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/stdlib/Symbol.map#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/stdlib/malloc.3#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/stdlib/malloc.c#6 integrate
.. //depot/projects/soc2008/konrad_collation/libc/stdlib/ptsname.3#1 branch
.. //depot/projects/soc2008/konrad_collation/libc/stdlib/ptsname.c#1 branch
.. //depot/projects/soc2008/konrad_collation/libc/sys/Makefile.inc#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/sys/Symbol.map#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/sys/execve.2#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/sys/getrlimit.2#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/sys/posix_openpt.2#1 branch
.. //depot/projects/soc2008/konrad_collation/libc/sys/wait.2#5 integrate

Differences ...

==== //depot/projects/soc2008/konrad_collation/libc/include/libc_private.h#5 (text+ko) ====

@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/lib/libc/include/libc_private.h,v 1.19 2008/06/23 05:22:06 ed Exp $
+ * $FreeBSD: src/lib/libc/include/libc_private.h,v 1.20 2008/08/27 02:00:53 jasone Exp $
  *
  * Private definitions for libc, libc_r and libpthread.
  *
@@ -158,6 +158,12 @@
 extern const char *__progname;
 
 /*
+ * This function is used by the threading libraries to notify malloc that a
+ * thread is exiting.
+ */
+void _malloc_thread_cleanup(void);
+
+/*
  * These functions are used by the threading libraries in order to protect
  * malloc across fork().
  */

==== //depot/projects/soc2008/konrad_collation/libc/locale/collate.c#9 (text+ko) ====

@@ -128,7 +128,7 @@
 		}
 #if _BYTE_ORDER == _LITTLE_ENDIAN
 		for(z = 0; z < info.directive_count; z++) {
-			info.undef_pri[z] = ntohl(info.undef_pri[z]);
+			info.undef_pri.pri[z] = ntohl(info.undef_pri.pri[z]);
 			info.subst_count[z] = ntohl(info.subst_count[z]);
 		}
 		info.chain_count = ntohl(info.chain_count);
@@ -220,10 +220,11 @@
 		}
 	{
 		struct __collate_st_chain_pri *p = TMP->__chain_pri_table;
+
 		for(i = chains; i-- > 0; p++) {
 			wntohl(p->str, STR_LEN);
 			for(z = 0; z < info.directive_count; z++)
-				p->pri[z] = ntohl(p->pri[z]);
+				p->pri.pri[z] = ntohl(p->pri.pri[z]);
 		}
 	}
 	if (info.large_pri_count > 0) {
@@ -239,7 +240,7 @@
 	(void)strcpy(TMP->__encoding, encoding);
 	(void)memcpy(&TMP->__info, &info, sizeof(info));
 	__collate_data = TMP;
-	
+
 	__collate_load_error = (info.subst_count[0] > 0 ||
 	    info.subst_count[1] > 0);
 	__collate_load_error = 0;
@@ -328,6 +329,7 @@
 		s++;
 	}
 	dest_str[len] = 0;
+
 	return (dest_str);
 }
 
@@ -357,9 +359,11 @@
 		else
 			high = next - 1;
 	}
+
 	return NULL;
 }
 
+/* XXX maybe just return struct __collate_st_char_pri? */
 static struct __collate_st_large_char_pri *
 largesearch(const wchar_t key)
 {
@@ -381,15 +385,21 @@
 		else
 			high = next - 1;
 	}
+
 	return NULL;
 }
 
+#if 0
 void
 __collate_lookup(const wchar_t *t, int *len, int *prim, int *sec)
 {
 	struct __collate_st_chain_pri *p2;
 	int l;
 
+	/*
+	 * I didn't fix this function, as it is useless anyway in multiple
+	 * weight collation.
+	 */
 	*len = 1;
 	*prim = *sec = 0;
 	p2 = chainsearch(t, &l);
@@ -417,38 +427,52 @@
 	*prim = (l = __collate_info->undef_pri[0]) >= 0 ? l : *t - l;
 	*sec = (l = __collate_info->undef_pri[1]) >= 0 ? l : *t - l;
 }
+#endif
 
-void
-__collate_lookup_which(const wchar_t *t, int *len, int *pri, int which)
+struct __collate_st_char_pri *
+__collate_lookup(const wchar_t *t, int *len)
 {
 	struct __collate_st_chain_pri *p2;
-	int p, l;
+	struct __collate_st_large_char_pri *match;
+	int l;
 
 	*len = 1;
-	*pri = 0;
 	p2 = chainsearch(t, &l);
 	if (p2) {
+		fprintf(stderr, "vsdebug: chainsearch succeeded");
+		*len = l;
+
+		return &p2->pri;
+#if 0
+		int p;
+
 		p = p2->pri[which];
 		/* use the chain if pri >= 0 */
 		if (p >= 0) {
 			*len = l;
 			*pri = p;
+
 			return;
 		}
+#endif
 	}
 	if (*t <= UCHAR_MAX) {
+#if 0
 		*pri = __collate_char_pri_table[*t].pri[which];
-		return;
+#endif
+		return &__collate_char_pri_table[*t];
 	}
 	if (__collate_info->large_pri_count > 0) {
-		struct __collate_st_large_char_pri *match;
+
 		match = largesearch(*t);
 		if (match) {
-			*pri = match->pri.pri[which];
-			return;
+			return &match->pri;
 		}
 	}
+#if 0
 	*pri = (l = __collate_info->undef_pri[which]) >= 0 ? l : *t - l;
+#endif
+	return &__collate_info->undef_pri;
 }
 
 wchar_t *
@@ -482,6 +506,7 @@
 	if ((wcs = (wchar_t *)malloc(len * sizeof(wchar_t))) == NULL)
 		__collate_err(EX_OSERR, __func__);
 	wcscpy(wcs, s);
+
 	return (wcs);
 }
 
@@ -722,9 +747,10 @@
 	if (__collate_info->chain_count > 0) {
 		struct __collate_st_chain_pri *match;
 		int ll;
+
 		match = chainsearch(wname, &ll);
 		if (match) {
-			e = match->pri[0];
+			e = match->pri.pri[0];
 			if (e == 0)
 				return IGNORE_EQUIV_CLASS;
 			return e < 0 ? -e : e;
@@ -786,7 +812,8 @@
 	}
 	*wp = 0;
 	if (len > 1 && (ch = chainsearch(buf, &i)) != NULL) {
-		int e = ch->pri[0];
+		int e = ch->pri.pri[0];
+
 		if (e < 0)
 			e = -e;
 		if (e == equiv_class)

==== //depot/projects/soc2008/konrad_collation/libc/stdlib/Makefile.inc#5 (text+ko) ====

@@ -1,14 +1,14 @@
 #	from @(#)Makefile.inc	8.3 (Berkeley) 2/4/95
-# $FreeBSD: src/lib/libc/stdlib/Makefile.inc,v 1.54 2007/07/04 00:00:39 scf Exp $
+# $FreeBSD: src/lib/libc/stdlib/Makefile.inc,v 1.55 2008/08/20 08:31:58 ed Exp $
 
 # machine-independent stdlib sources
 .PATH: ${.CURDIR}/${MACHINE_ARCH}/stdlib ${.CURDIR}/stdlib
 
 MISRCS+=_Exit.c a64l.c abort.c abs.c atexit.c atof.c atoi.c atol.c atoll.c \
 	bsearch.c div.c exit.c getenv.c getopt.c getopt_long.c \
-	getsubopt.c grantpt.c hcreate.c heapsort.c imaxabs.c imaxdiv.c \
+	getsubopt.c hcreate.c heapsort.c imaxabs.c imaxdiv.c \
 	insque.c l64a.c labs.c ldiv.c llabs.c lldiv.c lsearch.c malloc.c \
-	merge.c qsort.c qsort_r.c radixsort.c rand.c random.c \
+	merge.c ptsname.c qsort.c qsort_r.c radixsort.c rand.c random.c \
 	reallocf.c realpath.c remque.c strfmon.c strtoimax.c \
 	strtol.c strtoll.c strtoq.c strtoul.c strtonum.c strtoull.c \
         strtoumax.c strtouq.c system.c tdelete.c tfind.c tsearch.c twalk.c
@@ -21,10 +21,10 @@
 .endif
 
 MAN+=	a64l.3 abort.3 abs.3 alloca.3 atexit.3 atof.3 atoi.3 atol.3 bsearch.3 \
-	div.3 exit.3 getenv.3 getopt.3 getopt_long.3 getsubopt.3 grantpt.3 \
+	div.3 exit.3 getenv.3 getopt.3 getopt_long.3 getsubopt.3 \
 	hcreate.3 imaxabs.3 imaxdiv.3 insque.3 labs.3 ldiv.3 llabs.3 lldiv.3 \
-	lsearch.3 malloc.3 memory.3 posix_memalign.3 qsort.3 radixsort.3 \
-	rand.3 random.3 \
+	lsearch.3 malloc.3 memory.3 posix_memalign.3 ptsname.3 qsort.3 \
+	radixsort.3 rand.3 random.3 \
 	realpath.3 strfmon.3 strtod.3 strtol.3 strtonum.3 strtoul.3 system.3 \
 	tsearch.3
 
@@ -33,10 +33,10 @@
 MLINKS+=exit.3 _Exit.3
 MLINKS+=getenv.3 putenv.3 getenv.3 setenv.3 getenv.3 unsetenv.3
 MLINKS+=getopt_long.3 getopt_long_only.3
-MLINKS+=grantpt.3 posix_openpt.3 grantpt.3 ptsname.3 grantpt.3 unlockpt.3
 MLINKS+=hcreate.3 hdestroy.3 hcreate.3 hsearch.3
 MLINKS+=insque.3 remque.3
 MLINKS+=lsearch.3 lfind.3
+MLINKS+=ptsname.3 grantpt.3 ptsname.3 unlockpt.3
 MLINKS+=qsort.3 heapsort.3 qsort.3 mergesort.3 qsort.3 qsort_r.3
 MLINKS+=rand.3 rand_r.3 rand.3 srand.3 rand.3 sranddev.3
 MLINKS+=random.3 initstate.3 random.3 setstate.3 random.3 srandom.3 \

==== //depot/projects/soc2008/konrad_collation/libc/stdlib/Symbol.map#5 (text) ====

@@ -1,5 +1,5 @@
 /*
- * $FreeBSD: src/lib/libc/stdlib/Symbol.map,v 1.7 2008/06/17 14:05:03 ed Exp $
+ * $FreeBSD: src/lib/libc/stdlib/Symbol.map,v 1.9 2008/08/27 02:00:53 jasone Exp $
  */
 
 FBSD_1.0 {
@@ -30,7 +30,6 @@
 	suboptarg;
 	getsubopt;
 	grantpt;
-	posix_openpt;
 	ptsname;
 	unlockpt;
 	hcreate;
@@ -94,6 +93,7 @@
 };
 
 FBSDprivate_1.0 {
+	_malloc_thread_cleanup;
 	_malloc_prefork;
 	_malloc_postfork;
 	__system;

==== //depot/projects/soc2008/konrad_collation/libc/stdlib/malloc.3#5 (text+ko) ====

@@ -30,9 +30,9 @@
 .\" SUCH DAMAGE.
 .\"
 .\"     @(#)malloc.3	8.1 (Berkeley) 6/4/93
-.\" $FreeBSD: src/lib/libc/stdlib/malloc.3,v 1.78 2008/02/17 17:09:24 jasone Exp $
+.\" $FreeBSD: src/lib/libc/stdlib/malloc.3,v 1.79 2008/08/27 02:00:53 jasone Exp $
 .\"
-.Dd February 17, 2008
+.Dd August 26, 2008
 .Dt MALLOC 3
 .Os
 .Sh NAME
@@ -154,7 +154,7 @@
 implementation-dependent.
 .Sh TUNING
 Once, when the first call is made to one of these memory allocation
-routines, various flags will be set or reset, which affect the
+routines, various flags will be set or reset, which affects the
 workings of this allocator implementation.
 .Pp
 The
@@ -196,6 +196,11 @@
 Therefore, some applications may benefit from increasing or decreasing this
 threshold parameter.
 This option is not available for some configurations (non-PIC).
+.It C
+Double/halve the size of the maximum size class that is a multiple of the
+cacheline size (64).
+Above this size, subpage spacing (256 bytes) is used for size classes.
+The default value is 512 bytes.
 .It D
 Use
 .Xr sbrk 2
@@ -214,6 +219,16 @@
 The default is 512 pages per arena;
 .Ev MALLOC_OPTIONS=10f
 will prevent any dirty unused pages from accumulating.
+.It G
+When there are multiple threads, use thread-specific caching for objects that
+are smaller than one page.
+This option is enabled by default.
+Thread-specific caching allows many allocations to be satisfied without
+performing any thread synchronization, at the cost of increased memory use.
+See the
+.Dq R
+option for related tuning information.
+This option is not available for some configurations (non-PIC).
 .It J
 Each byte of new memory allocated by
 .Fn malloc ,
@@ -248,7 +263,7 @@
 acquiring memory.
 .It N
 Double/halve the number of arenas.
-The default number of arenas is four times the number of CPUs, or one if there
+The default number of arenas is two times the number of CPUs, or one if there
 is a single CPU.
 .It P
 Various statistics are printed at program exit via an
@@ -259,14 +274,18 @@
 Therefore, this option should only be used with care; it is primarily intended
 as a performance tuning aid during application development.
 .It Q
-Double/halve the size of the allocation quantum.
-The default quantum is the minimum allowed by the architecture (typically 8 or
-16 bytes).
-.It S
 Double/halve the size of the maximum size class that is a multiple of the
-quantum.
-Above this size, power-of-two spacing is used for size classes.
-The default value is 512 bytes.
+quantum (8 or 16 bytes, depending on architecture).
+Above this size, cacheline spacing is used for size classes.
+The default value is 128 bytes.
+.It R
+Double/halve magazine size, which approximately doubles/halves the number of
+rounds in each magazine.
+Magazines are used by the thread-specific caching machinery to acquire and
+release objects in bulk.
+Increasing the magazine size decreases locking overhead, at the expense of
+increased memory usage.
+This option is not available for some configurations (non-PIC).
 .It U
 Generate
 .Dq utrace
@@ -358,6 +377,13 @@
 However, it may make sense to reduce the number of arenas if an application
 does not make much use of the allocation functions.
 .Pp
+In addition to multiple arenas, this allocator supports thread-specific
+caching for small objects (smaller than one page), in order to make it
+possible to completely avoid synchronization for most small allocation requests.
+Such caching allows very fast allocation in the common case, but it increases
+memory usage and fragmentation, since a bounded number of objects can remain
+allocated in each thread cache.
+.Pp
 Memory is conceptually broken into equal-sized chunks, where the chunk size is
 a power of two that is greater than the page size.
 Chunks are always aligned to multiples of the chunk size.
@@ -366,7 +392,7 @@
 .Pp
 User objects are broken into three categories according to size: small, large,
 and huge.
-Small objects are no larger than one half of a page.
+Small objects are smaller than one page.
 Large objects are smaller than the chunk size.
 Huge objects are a multiple of the chunk size.
 Small and large objects are managed by arenas; huge objects are managed
@@ -378,23 +404,24 @@
 contiguous pages (unused, backing a set of small objects, or backing one large
 object).
 The combination of chunk alignment and chunk page maps makes it possible to
-determine all metadata regarding small and large allocations in
-constant and logarithmic time, respectively.
+determine all metadata regarding small and large allocations in constant time.
 .Pp
 Small objects are managed in groups by page runs.
 Each run maintains a bitmap that tracks which regions are in use.
-Allocation requests that are no more than half the quantum (see the
+Allocation requests that are no more than half the quantum (8 or 16, depending
+on architecture) are rounded up to the nearest power of two.
+Allocation requests that are more than half the quantum, but no more than the
+minimum cacheline-multiple size class (see the
 .Dq Q
-option) are rounded up to the nearest power of two (typically 2, 4, or 8).
-Allocation requests that are more than half the quantum, but no more than the
-maximum quantum-multiple size class (see the
-.Dq S
 option) are rounded up to the nearest multiple of the quantum.
-Allocation requests that are larger than the maximum quantum-multiple size
-class, but no larger than one half of a page, are rounded up to the nearest
-power of two.
-Allocation requests that are larger than half of a page, but small enough to
-fit in an arena-managed chunk (see the
+Allocation requests that are more than the minimum cacheline-multiple size
+class, but no more than the minimum subpage-multiple size class (see the
+.Dq C
+option) are rounded up to the nearest multiple of the cacheline size (64).
+Allocation requests that are more than the minimum subpage-multiple size class
+are rounded up to the nearest multiple of the subpage size (256).
+Allocation requests that are more than one page, but small enough to fit in
+an arena-managed chunk (see the
 .Dq K
 option), are rounded up to the nearest run size.
 Allocation requests that are too large to fit in an arena-managed chunk are
@@ -402,8 +429,8 @@
 .Pp
 Allocations are packed tightly together, which can be an issue for
 multi-threaded applications.
-If you need to assure that allocations do not suffer from cache line sharing,
-round your allocation requests up to the nearest multiple of the cache line
+If you need to assure that allocations do not suffer from cacheline sharing,
+round your allocation requests up to the nearest multiple of the cacheline
 size.
 .Sh DEBUGGING MALLOC PROBLEMS
 The first thing to do is to set the

==== //depot/projects/soc2008/konrad_collation/libc/stdlib/malloc.c#6 (text+ko) ====

@@ -35,6 +35,9 @@
  *   + Multiple arenas are used if there are multiple CPUs, which reduces lock
  *     contention and cache sloshing.
  *
+ *   + Thread-specific caching is used if there are multiple threads, which
+ *     reduces the amount of locking.
+ *
  *   + Cache line sharing between arenas is avoided for internal data
  *     structures.
  *
@@ -48,37 +51,49 @@
  * and a 16 byte quantum on a 32-bit system, the size classes in each category
  * are as follows:
  *
- *   |=====================================|
- *   | Category | Subcategory    |    Size |
- *   |=====================================|
- *   | Small    | Tiny           |       2 |
- *   |          |                |       4 |
- *   |          |                |       8 |
- *   |          |----------------+---------|
- *   |          | Quantum-spaced |      16 |
- *   |          |                |      32 |
- *   |          |                |      48 |
- *   |          |                |     ... |
- *   |          |                |     480 |
- *   |          |                |     496 |
- *   |          |                |     512 |
- *   |          |----------------+---------|
- *   |          | Sub-page       |    1 kB |
- *   |          |                |    2 kB |
- *   |=====================================|
- *   | Large                     |    4 kB |
- *   |                           |    8 kB |
- *   |                           |   12 kB |
- *   |                           |     ... |
- *   |                           | 1012 kB |
- *   |                           | 1016 kB |
- *   |                           | 1020 kB |
- *   |=====================================|
- *   | Huge                      |    1 MB |
- *   |                           |    2 MB |
- *   |                           |    3 MB |
- *   |                           |     ... |
- *   |=====================================|
+ *   |=======================================|
+ *   | Category | Subcategory      |    Size |
+ *   |=======================================|
+ *   | Small    | Tiny             |       2 |
+ *   |          |                  |       4 |
+ *   |          |                  |       8 |
+ *   |          |------------------+---------|
+ *   |          | Quantum-spaced   |      16 |
+ *   |          |                  |      32 |
+ *   |          |                  |      48 |
+ *   |          |                  |     ... |
+ *   |          |                  |      96 |
+ *   |          |                  |     112 |
+ *   |          |                  |     128 |
+ *   |          |------------------+---------|
+ *   |          | Cacheline-spaced |     192 |
+ *   |          |                  |     256 |
+ *   |          |                  |     320 |
+ *   |          |                  |     384 |
+ *   |          |                  |     448 |
+ *   |          |                  |     512 |
+ *   |          |------------------+---------|
+ *   |          | Sub-page         |     768 |
+ *   |          |                  |    1024 |
+ *   |          |                  |    1280 |
+ *   |          |                  |     ... |
+ *   |          |                  |    3328 |
+ *   |          |                  |    3584 |
+ *   |          |                  |    3840 |
+ *   |=======================================|
+ *   | Large                       |    4 kB |
+ *   |                             |    8 kB |
+ *   |                             |   12 kB |
+ *   |                             |     ... |
+ *   |                             | 1012 kB |
+ *   |                             | 1016 kB |
+ *   |                             | 1020 kB |
+ *   |=======================================|
+ *   | Huge                        |    1 MB |
+ *   |                             |    2 MB |
+ *   |                             |    3 MB |
+ *   |                             |     ... |
+ *   |=======================================|
  *
  * A different mechanism is used for each category:
  *
@@ -113,6 +128,19 @@
 #endif
 
 /*
+ * MALLOC_TINY enables support for tiny objects, which are smaller than one
+ * quantum.
+ */
+#define	MALLOC_TINY
+
+/*
+ * MALLOC_MAG enables a magazine-based thread-specific caching layer for small
+ * objects.  This makes it possible to allocate/deallocate objects without any
+ * locking when the cache is in the steady state.
+ */
+#define	MALLOC_MAG
+
+/*
  * MALLOC_BALANCE enables monitoring of arena lock contention and dynamically
  * re-balances arena load if exponentially averaged contention exceeds a
  * certain threshold.
@@ -128,7 +156,7 @@
 #define	MALLOC_DSS
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.176 2008/08/14 17:31:42 jasone Exp $");
+__FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.177 2008/08/27 02:00:53 jasone Exp $");
 
 #include "libc_private.h"
 #ifdef MALLOC_DEBUG
@@ -184,46 +212,63 @@
 /* Size of stack-allocated buffer passed to strerror_r(). */
 #define	STRERROR_BUF		64
 
-/* Minimum alignment of allocations is 2^QUANTUM_2POW_MIN bytes. */
+/*
+ * The const_size2bin table is sized according to PAGESIZE_2POW, but for
+ * correctness reasons, we never assume that
+ * (pagesize == (1U << PAGESIZE_2POW)).
+ *
+ * Minimum alignment of allocations is 2^QUANTUM_2POW bytes.
+ */
 #ifdef __i386__
-#  define QUANTUM_2POW_MIN	4
+#  define PAGESIZE_2POW		12
+#  define QUANTUM_2POW		4
 #  define SIZEOF_PTR_2POW	2
 #  define CPU_SPINWAIT		__asm__ volatile("pause")
 #endif
 #ifdef __ia64__
-#  define QUANTUM_2POW_MIN	4
+#  define PAGESIZE_2POW		12
+#  define QUANTUM_2POW		4
 #  define SIZEOF_PTR_2POW	3
 #endif
 #ifdef __alpha__
-#  define QUANTUM_2POW_MIN	4
+#  define PAGESIZE_2POW		13
+#  define QUANTUM_2POW		4
 #  define SIZEOF_PTR_2POW	3
 #  define NO_TLS
 #endif
 #ifdef __sparc64__
-#  define QUANTUM_2POW_MIN	4
+#  define PAGESIZE_2POW		13
+#  define QUANTUM_2POW		4
 #  define SIZEOF_PTR_2POW	3
 #  define NO_TLS
 #endif
 #ifdef __amd64__
-#  define QUANTUM_2POW_MIN	4
+#  define PAGESIZE_2POW		12
+#  define QUANTUM_2POW		4
 #  define SIZEOF_PTR_2POW	3
 #  define CPU_SPINWAIT		__asm__ volatile("pause")
 #endif
 #ifdef __arm__
-#  define QUANTUM_2POW_MIN	3
+#  define PAGESIZE_2POW		12
+#  define QUANTUM_2POW		3
 #  define SIZEOF_PTR_2POW	2
 #  define NO_TLS
 #endif
 #ifdef __mips__
-#  define QUANTUM_2POW_MIN	3
+#  define PAGESIZE_2POW		12
+#  define QUANTUM_2POW		3
 #  define SIZEOF_PTR_2POW	2
 #  define NO_TLS
 #endif
 #ifdef __powerpc__
-#  define QUANTUM_2POW_MIN	4
+#  define PAGESIZE_2POW		12
+#  define QUANTUM_2POW		4
 #  define SIZEOF_PTR_2POW	2
 #endif
 
+#define	QUANTUM			((size_t)(1U << QUANTUM_2POW))
+#define	QUANTUM_MASK		(QUANTUM - 1)
+
 #define	SIZEOF_PTR		(1U << SIZEOF_PTR_2POW)
 
 /* sizeof(int) == (1U << SIZEOF_INT_2POW). */
@@ -237,6 +282,10 @@
 #endif
 
 #ifdef NO_TLS
+   /* MALLOC_MAG requires TLS. */
+#  ifdef MALLOC_MAG
+#    undef MALLOC_MAG
+#  endif
    /* MALLOC_BALANCE requires TLS. */
 #  ifdef MALLOC_BALANCE
 #    undef MALLOC_BALANCE
@@ -253,23 +302,42 @@
 #define	DIRTY_MAX_DEFAULT	(1U << 9)
 
 /*
- * Maximum size of L1 cache line.  This is used to avoid cache line aliasing,
- * so over-estimates are okay (up to a point), but under-estimates will
- * negatively affect performance.
+ * Maximum size of L1 cache line.  This is used to avoid cache line aliasing.
+ * In addition, this controls the spacing of cacheline-spaced size classes.
  */
 #define	CACHELINE_2POW		6
 #define	CACHELINE		((size_t)(1U << CACHELINE_2POW))
+#define	CACHELINE_MASK		(CACHELINE - 1)
 
-/* Smallest size class to support. */
-#define	TINY_MIN_2POW		1
+/*
+ * Subpages are an artificially designated partitioning of pages.  Their only
+ * purpose is to support subpage-spaced size classes.
+ *
+ * There must be at least 4 subpages per page, due to the way size classes are
+ * handled.
+ */
+#define	SUBPAGE_2POW		8
+#define	SUBPAGE			((size_t)(1U << SUBPAGE_2POW))
+#define	SUBPAGE_MASK		(SUBPAGE - 1)
+
+#ifdef MALLOC_TINY
+   /* Smallest size class to support. */
+#  define TINY_MIN_2POW		1
+#endif
 
 /*
  * Maximum size class that is a multiple of the quantum, but not (necessarily)
  * a power of 2.  Above this size, allocations are rounded up to the nearest
  * power of 2.
  */
-#define	SMALL_MAX_2POW_DEFAULT	9
-#define	SMALL_MAX_DEFAULT	(1U << SMALL_MAX_2POW_DEFAULT)
+#define	QSPACE_MAX_2POW_DEFAULT	7
+
+/*
+ * Maximum size class that is a multiple of the cacheline, but not (necessarily)
+ * a power of 2.  Above this size, allocations are rounded up to the nearest
+ * multiple of the subpage size.
+ */
+#define	CSPACE_MAX_2POW_DEFAULT	9
 
 /*
  * RUN_MAX_OVRHD indicates maximum desired run header overhead.  Runs are sized
@@ -293,8 +361,7 @@
 #define	RUN_MAX_OVRHD_RELAX	0x00001800U
 
 /* Put a cap on small object run size.  This overrides RUN_MAX_OVRHD. */
-#define	RUN_MAX_SMALL_2POW	15
-#define	RUN_MAX_SMALL		(1U << RUN_MAX_SMALL_2POW)
+#define	RUN_MAX_SMALL	(12 * pagesize)
 
 /*
  * Hyper-threaded CPUs may need a special instruction inside spin loops in
@@ -319,6 +386,15 @@
  */
 #define	BLOCK_COST_2POW		4
 
+#ifdef MALLOC_MAG
+   /*
+    * Default magazine size, in bytes.  max_rounds is calculated to make
+    * optimal use of the space, leaving just enough room for the magazine
+    * header.
+    */
+#  define MAG_SIZE_2POW_DEFAULT	9
+#endif
+
 #ifdef MALLOC_BALANCE
    /*
     * We use an exponential moving average to track recent lock contention,
@@ -369,6 +445,11 @@
 	 */
 	uint64_t	nrequests;
 
+#ifdef MALLOC_MAG
+	/* Number of magazine reloads from this bin. */
+	uint64_t	nmags;
+#endif
+
 	/* Total number of runs created for this bin's size class. */
 	uint64_t	nruns;
 
@@ -678,6 +759,35 @@
 
 /******************************************************************************/
 /*
+ * Magazine data structures.
+ */
+
+#ifdef MALLOC_MAG
+typedef struct mag_s mag_t;
+struct mag_s {
+	size_t		binind; /* Index of associated bin. */
+	size_t		nrounds;
+	void		*rounds[1]; /* Dynamically sized. */
+};
+
+/*
+ * Magazines are lazily allocated, but once created, they remain until the
+ * associated mag_rack is destroyed.
+ */
+typedef struct bin_mags_s bin_mags_t;
+struct bin_mags_s {
+	mag_t	*curmag;
+	mag_t	*sparemag;
+};
+
+typedef struct mag_rack_s mag_rack_t;
+struct mag_rack_s {
+	bin_mags_t	bin_mags[1]; /* Dynamically sized. */
+};
+#endif
+
+/******************************************************************************/
+/*
  * Data.
  */
 
@@ -690,16 +800,147 @@
 static size_t		pagesize_2pow;
 
 /* Various bin-related settings. */
-static size_t		bin_maxclass; /* Max size class for bins. */
-static unsigned		ntbins; /* Number of (2^n)-spaced tiny bins. */
+#ifdef MALLOC_TINY		/* Number of (2^n)-spaced tiny bins. */
+#  define		ntbins	((unsigned)(QUANTUM_2POW - TINY_MIN_2POW))
+#else
+#  define		ntbins	0
+#endif
 static unsigned		nqbins; /* Number of quantum-spaced bins. */
-static unsigned		nsbins; /* Number of (2^n)-spaced sub-page bins. */
-static size_t		small_min;
-static size_t		small_max;
+static unsigned		ncbins; /* Number of cacheline-spaced bins. */
+static unsigned		nsbins; /* Number of subpage-spaced bins. */
+static unsigned		nbins;
+#ifdef MALLOC_TINY
+#  define		tspace_max	((size_t)(QUANTUM >> 1))
+#endif
+#define			qspace_min	QUANTUM
+static size_t		qspace_max;
+static size_t		cspace_min;
+static size_t		cspace_max;
+static size_t		sspace_min;
+static size_t		sspace_max;
+#define			bin_maxclass	sspace_max
+
+static uint8_t const	*size2bin;
+/*
+ * const_size2bin is a static constant lookup table that in the common case can
+ * be used as-is for size2bin.  For dynamically linked programs, this avoids
+ * a page of memory overhead per process.
+ */
+#define	S2B_1(i)	i,
+#define	S2B_2(i)	S2B_1(i) S2B_1(i)
+#define	S2B_4(i)	S2B_2(i) S2B_2(i)
+#define	S2B_8(i)	S2B_4(i) S2B_4(i)
+#define	S2B_16(i)	S2B_8(i) S2B_8(i)
+#define	S2B_32(i)	S2B_16(i) S2B_16(i)
+#define	S2B_64(i)	S2B_32(i) S2B_32(i)
+#define	S2B_128(i)	S2B_64(i) S2B_64(i)
+#define	S2B_256(i)	S2B_128(i) S2B_128(i)
+static const uint8_t	const_size2bin[(1U << PAGESIZE_2POW) - 255] = {
+	S2B_1(0xffU)		/*    0 */
+#if (QUANTUM_2POW == 4)
+/* 64-bit system ************************/
+#  ifdef MALLOC_TINY
+	S2B_2(0)		/*    2 */
+	S2B_2(1)		/*    4 */
+	S2B_4(2)		/*    8 */
+	S2B_8(3)		/*   16 */
+#    define S2B_QMIN 3
+#  else
+	S2B_16(0)		/*   16 */
+#    define S2B_QMIN 0
+#  endif
+	S2B_16(S2B_QMIN + 1)	/*   32 */
+	S2B_16(S2B_QMIN + 2)	/*   48 */
+	S2B_16(S2B_QMIN + 3)	/*   64 */
+	S2B_16(S2B_QMIN + 4)	/*   80 */
+	S2B_16(S2B_QMIN + 5)	/*   96 */
+	S2B_16(S2B_QMIN + 6)	/*  112 */
+	S2B_16(S2B_QMIN + 7)	/*  128 */
+#  define S2B_CMIN (S2B_QMIN + 8)
+#else
+/* 32-bit system ************************/
+#  ifdef MALLOC_TINY
+	S2B_2(0)		/*    2 */
+	S2B_2(1)		/*    4 */
+	S2B_4(2)		/*    8 */
+#    define S2B_QMIN 2
+#  else
+	S2B_8(0)		/*    8 */
+#    define S2B_QMIN 0
+#  endif
+	S2B_8(S2B_QMIN + 1)	/*   16 */
+	S2B_8(S2B_QMIN + 2)	/*   24 */
+	S2B_8(S2B_QMIN + 3)	/*   32 */
+	S2B_8(S2B_QMIN + 4)	/*   40 */
+	S2B_8(S2B_QMIN + 5)	/*   48 */
+	S2B_8(S2B_QMIN + 6)	/*   56 */
+	S2B_8(S2B_QMIN + 7)	/*   64 */
+	S2B_8(S2B_QMIN + 8)	/*   72 */
+	S2B_8(S2B_QMIN + 9)	/*   80 */
+	S2B_8(S2B_QMIN + 10)	/*   88 */
+	S2B_8(S2B_QMIN + 11)	/*   96 */
+	S2B_8(S2B_QMIN + 12)	/*  104 */
+	S2B_8(S2B_QMIN + 13)	/*  112 */
+	S2B_8(S2B_QMIN + 14)	/*  120 */
+	S2B_8(S2B_QMIN + 15)	/*  128 */
+#  define S2B_CMIN (S2B_QMIN + 16)
+#endif
+/****************************************/
+	S2B_64(S2B_CMIN + 0)	/*  192 */
+	S2B_64(S2B_CMIN + 1)	/*  256 */
+	S2B_64(S2B_CMIN + 2)	/*  320 */
+	S2B_64(S2B_CMIN + 3)	/*  384 */
+	S2B_64(S2B_CMIN + 4)	/*  448 */
+	S2B_64(S2B_CMIN + 5)	/*  512 */
+#  define S2B_SMIN (S2B_CMIN + 6)
+	S2B_256(S2B_SMIN + 0)	/*  768 */
+	S2B_256(S2B_SMIN + 1)	/* 1024 */
+	S2B_256(S2B_SMIN + 2)	/* 1280 */
+	S2B_256(S2B_SMIN + 3)	/* 1536 */
+	S2B_256(S2B_SMIN + 4)	/* 1792 */
+	S2B_256(S2B_SMIN + 5)	/* 2048 */
+	S2B_256(S2B_SMIN + 6)	/* 2304 */
+	S2B_256(S2B_SMIN + 7)	/* 2560 */
+	S2B_256(S2B_SMIN + 8)	/* 2816 */
+	S2B_256(S2B_SMIN + 9)	/* 3072 */
+	S2B_256(S2B_SMIN + 10)	/* 3328 */
+	S2B_256(S2B_SMIN + 11)	/* 3584 */
+	S2B_256(S2B_SMIN + 12)	/* 3840 */
+#if (PAGESIZE_2POW == 13)
+	S2B_256(S2B_SMIN + 13)	/* 4096 */
+	S2B_256(S2B_SMIN + 14)	/* 4352 */
+	S2B_256(S2B_SMIN + 15)	/* 4608 */
+	S2B_256(S2B_SMIN + 16)	/* 4864 */
+	S2B_256(S2B_SMIN + 17)	/* 5120 */
+	S2B_256(S2B_SMIN + 18)	/* 5376 */
+	S2B_256(S2B_SMIN + 19)	/* 5632 */
+	S2B_256(S2B_SMIN + 20)	/* 5888 */
+	S2B_256(S2B_SMIN + 21)	/* 6144 */
+	S2B_256(S2B_SMIN + 22)	/* 6400 */
+	S2B_256(S2B_SMIN + 23)	/* 6656 */
+	S2B_256(S2B_SMIN + 24)	/* 6912 */
+	S2B_256(S2B_SMIN + 25)	/* 7168 */
+	S2B_256(S2B_SMIN + 26)	/* 7424 */
+	S2B_256(S2B_SMIN + 27)	/* 7680 */
+	S2B_256(S2B_SMIN + 28)	/* 7936 */
+#endif
+};
+#undef S2B_1
+#undef S2B_2
+#undef S2B_4
+#undef S2B_8
+#undef S2B_16
+#undef S2B_32
+#undef S2B_64
+#undef S2B_128
+#undef S2B_256
+#undef S2B_QMIN
+#undef S2B_CMIN
+#undef S2B_SMIN
 
-/* Various quantum-related settings. */
-static size_t		quantum;
-static size_t		quantum_mask; /* (quantum - 1). */
+#ifdef MALLOC_MAG
+static size_t		max_rounds;
+#endif
 
 /* Various chunk-related settings. */
 static size_t		chunksize;
@@ -796,6 +1037,14 @@
 static __thread arena_t	*arenas_map;
 #endif
 
+#ifdef MALLOC_MAG
+/*
+ * Map of thread-specific magazine racks, used for thread-specific object
+ * caching.
+ */
+static __thread mag_rack_t	*mag_rack;
+#endif
+
 #ifdef MALLOC_STATS
 /* Chunk statistics. */
 static chunk_stats_t	stats_chunks;
@@ -818,13 +1067,17 @@
 static bool	opt_dss = true;
 static bool	opt_mmap = true;
 #endif
+#ifdef MALLOC_MAG
+static bool	opt_mag = true;
+static size_t	opt_mag_size_2pow = MAG_SIZE_2POW_DEFAULT;
+#endif
 static size_t	opt_dirty_max = DIRTY_MAX_DEFAULT;
 #ifdef MALLOC_BALANCE
 static uint64_t	opt_balance_threshold = BALANCE_THRESHOLD_DEFAULT;
 #endif
 static bool	opt_print_stats = false;
-static size_t	opt_quantum_2pow = QUANTUM_2POW_MIN;
-static size_t	opt_small_max_2pow = SMALL_MAX_2POW_DEFAULT;
+static size_t	opt_qspace_max_2pow = QSPACE_MAX_2POW_DEFAULT;
+static size_t	opt_cspace_max_2pow = CSPACE_MAX_2POW_DEFAULT;
 static size_t	opt_chunk_2pow = CHUNK_2POW_DEFAULT;
 static bool	opt_utrace = false;
 static bool	opt_sysv = false;
@@ -902,15 +1155,21 @@
 static void	arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk,
     arena_run_t *run, size_t oldsize, size_t newsize, bool dirty);
 static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin);
-static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin);
-static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size);
+static void	*arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin);
+static size_t	arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size);
 #ifdef MALLOC_BALANCE
 static void	arena_lock_balance_hard(arena_t *arena);
 #endif
+#ifdef MALLOC_MAG
+static void	mag_load(mag_t *mag);
+#endif
 static void	*arena_malloc_large(arena_t *arena, size_t size, bool zero);
 static void	*arena_palloc(arena_t *arena, size_t alignment, size_t size,
     size_t alloc_size);
 static size_t	arena_salloc(const void *ptr);
+#ifdef MALLOC_MAG
+static void	mag_unload(mag_t *mag);
+#endif
 static void	arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk,
     void *ptr);
 static void	arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk,
@@ -921,11 +1180,22 @@
 static void	*arena_ralloc(void *ptr, size_t size, size_t oldsize);
 static bool	arena_new(arena_t *arena);
 static arena_t	*arenas_extend(unsigned ind);
+#ifdef MALLOC_MAG
+static mag_t	*mag_create(arena_t *arena, size_t binind);
+static void	mag_destroy(mag_t *mag);
+static mag_rack_t *mag_rack_create(arena_t *arena);

>>> TRUNCATED FOR MAIL (1000 lines) <<<


More information about the p4-projects mailing list