PERFORCE change 149020 for review
Konrad Jankowski
konrad at FreeBSD.org
Mon Sep 1 21:08:04 UTC 2008
http://perforce.freebsd.org/chv.cgi?CH=149020
Change 149020 by konrad at vspredator on 2008/09/01 21:07:06
ifc
Affected files ...
.. //depot/projects/soc2008/konrad_collation/libc/include/libc_private.h#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/locale/collate.c#9 edit
.. //depot/projects/soc2008/konrad_collation/libc/stdlib/Makefile.inc#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/stdlib/Symbol.map#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/stdlib/malloc.3#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/stdlib/malloc.c#6 integrate
.. //depot/projects/soc2008/konrad_collation/libc/stdlib/ptsname.3#1 branch
.. //depot/projects/soc2008/konrad_collation/libc/stdlib/ptsname.c#1 branch
.. //depot/projects/soc2008/konrad_collation/libc/sys/Makefile.inc#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/sys/Symbol.map#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/sys/execve.2#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/sys/getrlimit.2#5 integrate
.. //depot/projects/soc2008/konrad_collation/libc/sys/posix_openpt.2#1 branch
.. //depot/projects/soc2008/konrad_collation/libc/sys/wait.2#5 integrate
Differences ...
==== //depot/projects/soc2008/konrad_collation/libc/include/libc_private.h#5 (text+ko) ====
@@ -26,7 +26,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD: src/lib/libc/include/libc_private.h,v 1.19 2008/06/23 05:22:06 ed Exp $
+ * $FreeBSD: src/lib/libc/include/libc_private.h,v 1.20 2008/08/27 02:00:53 jasone Exp $
*
* Private definitions for libc, libc_r and libpthread.
*
@@ -158,6 +158,12 @@
extern const char *__progname;
/*
+ * This function is used by the threading libraries to notify malloc that a
+ * thread is exiting.
+ */
+void _malloc_thread_cleanup(void);
+
+/*
* These functions are used by the threading libraries in order to protect
* malloc across fork().
*/
==== //depot/projects/soc2008/konrad_collation/libc/locale/collate.c#9 (text+ko) ====
@@ -128,7 +128,7 @@
}
#if _BYTE_ORDER == _LITTLE_ENDIAN
for(z = 0; z < info.directive_count; z++) {
- info.undef_pri[z] = ntohl(info.undef_pri[z]);
+ info.undef_pri.pri[z] = ntohl(info.undef_pri.pri[z]);
info.subst_count[z] = ntohl(info.subst_count[z]);
}
info.chain_count = ntohl(info.chain_count);
@@ -220,10 +220,11 @@
}
{
struct __collate_st_chain_pri *p = TMP->__chain_pri_table;
+
for(i = chains; i-- > 0; p++) {
wntohl(p->str, STR_LEN);
for(z = 0; z < info.directive_count; z++)
- p->pri[z] = ntohl(p->pri[z]);
+ p->pri.pri[z] = ntohl(p->pri.pri[z]);
}
}
if (info.large_pri_count > 0) {
@@ -239,7 +240,7 @@
(void)strcpy(TMP->__encoding, encoding);
(void)memcpy(&TMP->__info, &info, sizeof(info));
__collate_data = TMP;
-
+
__collate_load_error = (info.subst_count[0] > 0 ||
info.subst_count[1] > 0);
__collate_load_error = 0;
@@ -328,6 +329,7 @@
s++;
}
dest_str[len] = 0;
+
return (dest_str);
}
@@ -357,9 +359,11 @@
else
high = next - 1;
}
+
return NULL;
}
+/* XXX maybe just return struct __collate_st_char_pri? */
static struct __collate_st_large_char_pri *
largesearch(const wchar_t key)
{
@@ -381,15 +385,21 @@
else
high = next - 1;
}
+
return NULL;
}
+#if 0
void
__collate_lookup(const wchar_t *t, int *len, int *prim, int *sec)
{
struct __collate_st_chain_pri *p2;
int l;
+ /*
+	 * I didn't fix this function, as it is anyway useless in multiple
+ * weight collation.
+ */
*len = 1;
*prim = *sec = 0;
p2 = chainsearch(t, &l);
@@ -417,38 +427,52 @@
*prim = (l = __collate_info->undef_pri[0]) >= 0 ? l : *t - l;
*sec = (l = __collate_info->undef_pri[1]) >= 0 ? l : *t - l;
}
+#endif
-void
-__collate_lookup_which(const wchar_t *t, int *len, int *pri, int which)
+struct __collate_st_char_pri *
+__collate_lookup(const wchar_t *t, int *len)
{
struct __collate_st_chain_pri *p2;
- int p, l;
+ struct __collate_st_large_char_pri *match;
+ int l;
*len = 1;
- *pri = 0;
p2 = chainsearch(t, &l);
if (p2) {
+ fprintf(stderr, "vsdebug: chainsearch succeeded");
+ *len = l;
+
+ return &p2->pri;
+#if 0
+ int p;
+
p = p2->pri[which];
/* use the chain if pri >= 0 */
if (p >= 0) {
*len = l;
*pri = p;
+
return;
}
+#endif
}
if (*t <= UCHAR_MAX) {
+#if 0
*pri = __collate_char_pri_table[*t].pri[which];
- return;
+#endif
+ return &__collate_char_pri_table[*t];
}
if (__collate_info->large_pri_count > 0) {
- struct __collate_st_large_char_pri *match;
+
match = largesearch(*t);
if (match) {
- *pri = match->pri.pri[which];
- return;
+ return &match->pri;
}
}
+#if 0
*pri = (l = __collate_info->undef_pri[which]) >= 0 ? l : *t - l;
+#endif
+ return &__collate_info->undef_pri;
}
wchar_t *
@@ -482,6 +506,7 @@
if ((wcs = (wchar_t *)malloc(len * sizeof(wchar_t))) == NULL)
__collate_err(EX_OSERR, __func__);
wcscpy(wcs, s);
+
return (wcs);
}
@@ -722,9 +747,10 @@
if (__collate_info->chain_count > 0) {
struct __collate_st_chain_pri *match;
int ll;
+
match = chainsearch(wname, &ll);
if (match) {
- e = match->pri[0];
+ e = match->pri.pri[0];
if (e == 0)
return IGNORE_EQUIV_CLASS;
return e < 0 ? -e : e;
@@ -786,7 +812,8 @@
}
*wp = 0;
if (len > 1 && (ch = chainsearch(buf, &i)) != NULL) {
- int e = ch->pri[0];
+ int e = ch->pri.pri[0];
+
if (e < 0)
e = -e;
if (e == equiv_class)
==== //depot/projects/soc2008/konrad_collation/libc/stdlib/Makefile.inc#5 (text+ko) ====
@@ -1,14 +1,14 @@
# from @(#)Makefile.inc 8.3 (Berkeley) 2/4/95
-# $FreeBSD: src/lib/libc/stdlib/Makefile.inc,v 1.54 2007/07/04 00:00:39 scf Exp $
+# $FreeBSD: src/lib/libc/stdlib/Makefile.inc,v 1.55 2008/08/20 08:31:58 ed Exp $
# machine-independent stdlib sources
.PATH: ${.CURDIR}/${MACHINE_ARCH}/stdlib ${.CURDIR}/stdlib
MISRCS+=_Exit.c a64l.c abort.c abs.c atexit.c atof.c atoi.c atol.c atoll.c \
bsearch.c div.c exit.c getenv.c getopt.c getopt_long.c \
- getsubopt.c grantpt.c hcreate.c heapsort.c imaxabs.c imaxdiv.c \
+ getsubopt.c hcreate.c heapsort.c imaxabs.c imaxdiv.c \
insque.c l64a.c labs.c ldiv.c llabs.c lldiv.c lsearch.c malloc.c \
- merge.c qsort.c qsort_r.c radixsort.c rand.c random.c \
+ merge.c ptsname.c qsort.c qsort_r.c radixsort.c rand.c random.c \
reallocf.c realpath.c remque.c strfmon.c strtoimax.c \
strtol.c strtoll.c strtoq.c strtoul.c strtonum.c strtoull.c \
strtoumax.c strtouq.c system.c tdelete.c tfind.c tsearch.c twalk.c
@@ -21,10 +21,10 @@
.endif
MAN+= a64l.3 abort.3 abs.3 alloca.3 atexit.3 atof.3 atoi.3 atol.3 bsearch.3 \
- div.3 exit.3 getenv.3 getopt.3 getopt_long.3 getsubopt.3 grantpt.3 \
+ div.3 exit.3 getenv.3 getopt.3 getopt_long.3 getsubopt.3 \
hcreate.3 imaxabs.3 imaxdiv.3 insque.3 labs.3 ldiv.3 llabs.3 lldiv.3 \
- lsearch.3 malloc.3 memory.3 posix_memalign.3 qsort.3 radixsort.3 \
- rand.3 random.3 \
+ lsearch.3 malloc.3 memory.3 posix_memalign.3 ptsname.3 qsort.3 \
+ radixsort.3 rand.3 random.3 \
realpath.3 strfmon.3 strtod.3 strtol.3 strtonum.3 strtoul.3 system.3 \
tsearch.3
@@ -33,10 +33,10 @@
MLINKS+=exit.3 _Exit.3
MLINKS+=getenv.3 putenv.3 getenv.3 setenv.3 getenv.3 unsetenv.3
MLINKS+=getopt_long.3 getopt_long_only.3
-MLINKS+=grantpt.3 posix_openpt.3 grantpt.3 ptsname.3 grantpt.3 unlockpt.3
MLINKS+=hcreate.3 hdestroy.3 hcreate.3 hsearch.3
MLINKS+=insque.3 remque.3
MLINKS+=lsearch.3 lfind.3
+MLINKS+=ptsname.3 grantpt.3 ptsname.3 unlockpt.3
MLINKS+=qsort.3 heapsort.3 qsort.3 mergesort.3 qsort.3 qsort_r.3
MLINKS+=rand.3 rand_r.3 rand.3 srand.3 rand.3 sranddev.3
MLINKS+=random.3 initstate.3 random.3 setstate.3 random.3 srandom.3 \
==== //depot/projects/soc2008/konrad_collation/libc/stdlib/Symbol.map#5 (text) ====
@@ -1,5 +1,5 @@
/*
- * $FreeBSD: src/lib/libc/stdlib/Symbol.map,v 1.7 2008/06/17 14:05:03 ed Exp $
+ * $FreeBSD: src/lib/libc/stdlib/Symbol.map,v 1.9 2008/08/27 02:00:53 jasone Exp $
*/
FBSD_1.0 {
@@ -30,7 +30,6 @@
suboptarg;
getsubopt;
grantpt;
- posix_openpt;
ptsname;
unlockpt;
hcreate;
@@ -94,6 +93,7 @@
};
FBSDprivate_1.0 {
+ _malloc_thread_cleanup;
_malloc_prefork;
_malloc_postfork;
__system;
==== //depot/projects/soc2008/konrad_collation/libc/stdlib/malloc.3#5 (text+ko) ====
@@ -30,9 +30,9 @@
.\" SUCH DAMAGE.
.\"
.\" @(#)malloc.3 8.1 (Berkeley) 6/4/93
-.\" $FreeBSD: src/lib/libc/stdlib/malloc.3,v 1.78 2008/02/17 17:09:24 jasone Exp $
+.\" $FreeBSD: src/lib/libc/stdlib/malloc.3,v 1.79 2008/08/27 02:00:53 jasone Exp $
.\"
-.Dd February 17, 2008
+.Dd August 26, 2008
.Dt MALLOC 3
.Os
.Sh NAME
@@ -154,7 +154,7 @@
implementation-dependent.
.Sh TUNING
Once, when the first call is made to one of these memory allocation
-routines, various flags will be set or reset, which affect the
+routines, various flags will be set or reset, which affects the
workings of this allocator implementation.
.Pp
The
@@ -196,6 +196,11 @@
Therefore, some applications may benefit from increasing or decreasing this
threshold parameter.
This option is not available for some configurations (non-PIC).
+.It C
+Double/halve the size of the maximum size class that is a multiple of the
+cacheline size (64).
+Above this size, subpage spacing (256 bytes) is used for size classes.
+The default value is 512 bytes.
.It D
Use
.Xr sbrk 2
@@ -214,6 +219,16 @@
The default is 512 pages per arena;
.Ev MALLOC_OPTIONS=10f
will prevent any dirty unused pages from accumulating.
+.It G
+When there are multiple threads, use thread-specific caching for objects that
+are smaller than one page.
+This option is enabled by default.
+Thread-specific caching allows many allocations to be satisfied without
+performing any thread synchronization, at the cost of increased memory use.
+See the
+.Dq R
+option for related tuning information.
+This option is not available for some configurations (non-PIC).
.It J
Each byte of new memory allocated by
.Fn malloc ,
@@ -248,7 +263,7 @@
acquiring memory.
.It N
Double/halve the number of arenas.
-The default number of arenas is four times the number of CPUs, or one if there
+The default number of arenas is two times the number of CPUs, or one if there
is a single CPU.
.It P
Various statistics are printed at program exit via an
@@ -259,14 +274,18 @@
Therefore, this option should only be used with care; it is primarily intended
as a performance tuning aid during application development.
.It Q
-Double/halve the size of the allocation quantum.
-The default quantum is the minimum allowed by the architecture (typically 8 or
-16 bytes).
-.It S
Double/halve the size of the maximum size class that is a multiple of the
-quantum.
-Above this size, power-of-two spacing is used for size classes.
-The default value is 512 bytes.
+quantum (8 or 16 bytes, depending on architecture).
+Above this size, cacheline spacing is used for size classes.
+The default value is 128 bytes.
+.It R
+Double/halve magazine size, which approximately doubles/halves the number of
+rounds in each magazine.
+Magazines are used by the thread-specific caching machinery to acquire and
+release objects in bulk.
+Increasing the magazine size decreases locking overhead, at the expense of
+increased memory usage.
+This option is not available for some configurations (non-PIC).
.It U
Generate
.Dq utrace
@@ -358,6 +377,13 @@
However, it may make sense to reduce the number of arenas if an application
does not make much use of the allocation functions.
.Pp
+In addition to multiple arenas, this allocator supports thread-specific
+caching for small objects (smaller than one page), in order to make it
+possible to completely avoid synchronization for most small allocation requests.
+Such caching allows very fast allocation in the common case, but it increases
+memory usage and fragmentation, since a bounded number of objects can remain
+allocated in each thread cache.
+.Pp
Memory is conceptually broken into equal-sized chunks, where the chunk size is
a power of two that is greater than the page size.
Chunks are always aligned to multiples of the chunk size.
@@ -366,7 +392,7 @@
.Pp
User objects are broken into three categories according to size: small, large,
and huge.
-Small objects are no larger than one half of a page.
+Small objects are smaller than one page.
Large objects are smaller than the chunk size.
Huge objects are a multiple of the chunk size.
Small and large objects are managed by arenas; huge objects are managed
@@ -378,23 +404,24 @@
contiguous pages (unused, backing a set of small objects, or backing one large
object).
The combination of chunk alignment and chunk page maps makes it possible to
-determine all metadata regarding small and large allocations in
-constant and logarithmic time, respectively.
+determine all metadata regarding small and large allocations in constant time.
.Pp
Small objects are managed in groups by page runs.
Each run maintains a bitmap that tracks which regions are in use.
-Allocation requests that are no more than half the quantum (see the
+Allocation requests that are no more than half the quantum (8 or 16, depending
+on architecture) are rounded up to the nearest power of two.
+Allocation requests that are more than half the quantum, but no more than the
+minimum cacheline-multiple size class (see the
.Dq Q
-option) are rounded up to the nearest power of two (typically 2, 4, or 8).
-Allocation requests that are more than half the quantum, but no more than the
-maximum quantum-multiple size class (see the
-.Dq S
option) are rounded up to the nearest multiple of the quantum.
-Allocation requests that are larger than the maximum quantum-multiple size
-class, but no larger than one half of a page, are rounded up to the nearest
-power of two.
-Allocation requests that are larger than half of a page, but small enough to
-fit in an arena-managed chunk (see the
+Allocation requests that are more than the minimum cacheline-multiple size
+class, but no more than the minimum subpage-multiple size class (see the
+.Dq C
+option) are rounded up to the nearest multiple of the cacheline size (64).
+Allocation requests that are more than the minimum subpage-multiple size class
+are rounded up to the nearest multiple of the subpage size (256).
+Allocation requests that are more than one page, but small enough to fit in
+an arena-managed chunk (see the
.Dq K
option), are rounded up to the nearest run size.
Allocation requests that are too large to fit in an arena-managed chunk are
@@ -402,8 +429,8 @@
.Pp
Allocations are packed tightly together, which can be an issue for
multi-threaded applications.
-If you need to assure that allocations do not suffer from cache line sharing,
-round your allocation requests up to the nearest multiple of the cache line
+If you need to assure that allocations do not suffer from cacheline sharing,
+round your allocation requests up to the nearest multiple of the cacheline
size.
.Sh DEBUGGING MALLOC PROBLEMS
The first thing to do is to set the
==== //depot/projects/soc2008/konrad_collation/libc/stdlib/malloc.c#6 (text+ko) ====
@@ -35,6 +35,9 @@
* + Multiple arenas are used if there are multiple CPUs, which reduces lock
* contention and cache sloshing.
*
+ * + Thread-specific caching is used if there are multiple threads, which
+ * reduces the amount of locking.
+ *
* + Cache line sharing between arenas is avoided for internal data
* structures.
*
@@ -48,37 +51,49 @@
* and a 16 byte quantum on a 32-bit system, the size classes in each category
* are as follows:
*
- * |=====================================|
- * | Category | Subcategory | Size |
- * |=====================================|
- * | Small | Tiny | 2 |
- * | | | 4 |
- * | | | 8 |
- * | |----------------+---------|
- * | | Quantum-spaced | 16 |
- * | | | 32 |
- * | | | 48 |
- * | | | ... |
- * | | | 480 |
- * | | | 496 |
- * | | | 512 |
- * | |----------------+---------|
- * | | Sub-page | 1 kB |
- * | | | 2 kB |
- * |=====================================|
- * | Large | 4 kB |
- * | | 8 kB |
- * | | 12 kB |
- * | | ... |
- * | | 1012 kB |
- * | | 1016 kB |
- * | | 1020 kB |
- * |=====================================|
- * | Huge | 1 MB |
- * | | 2 MB |
- * | | 3 MB |
- * | | ... |
- * |=====================================|
+ * |=======================================|
+ * | Category | Subcategory | Size |
+ * |=======================================|
+ * | Small | Tiny | 2 |
+ * | | | 4 |
+ * | | | 8 |
+ * | |------------------+---------|
+ * | | Quantum-spaced | 16 |
+ * | | | 32 |
+ * | | | 48 |
+ * | | | ... |
+ * | | | 96 |
+ * | | | 112 |
+ * | | | 128 |
+ * | |------------------+---------|
+ * | | Cacheline-spaced | 192 |
+ * | | | 256 |
+ * | | | 320 |
+ * | | | 384 |
+ * | | | 448 |
+ * | | | 512 |
+ * | |------------------+---------|
+ * | | Sub-page | 760 |
+ * | | | 1024 |
+ * | | | 1280 |
+ * | | | ... |
+ * | | | 3328 |
+ * | | | 3584 |
+ * | | | 3840 |
+ * |=======================================|
+ * | Large | 4 kB |
+ * | | 8 kB |
+ * | | 12 kB |
+ * | | ... |
+ * | | 1012 kB |
+ * | | 1016 kB |
+ * | | 1020 kB |
+ * |=======================================|
+ * | Huge | 1 MB |
+ * | | 2 MB |
+ * | | 3 MB |
+ * | | ... |
+ * |=======================================|
*
* A different mechanism is used for each category:
*
@@ -113,6 +128,19 @@
#endif
/*
+ * MALLOC_TINY enables support for tiny objects, which are smaller than one
+ * quantum.
+ */
+#define MALLOC_TINY
+
+/*
+ * MALLOC_MAG enables a magazine-based thread-specific caching layer for small
+ * objects. This makes it possible to allocate/deallocate objects without any
+ * locking when the cache is in the steady state.
+ */
+#define MALLOC_MAG
+
+/*
* MALLOC_BALANCE enables monitoring of arena lock contention and dynamically
* re-balances arena load if exponentially averaged contention exceeds a
* certain threshold.
@@ -128,7 +156,7 @@
#define MALLOC_DSS
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.176 2008/08/14 17:31:42 jasone Exp $");
+__FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.177 2008/08/27 02:00:53 jasone Exp $");
#include "libc_private.h"
#ifdef MALLOC_DEBUG
@@ -184,46 +212,63 @@
/* Size of stack-allocated buffer passed to strerror_r(). */
#define STRERROR_BUF 64
-/* Minimum alignment of allocations is 2^QUANTUM_2POW_MIN bytes. */
+/*
+ * The const_size2bin table is sized according to PAGESIZE_2POW, but for
+ * correctness reasons, we never assume that
+ * (pagesize == (1U << * PAGESIZE_2POW)).
+ *
+ * Minimum alignment of allocations is 2^QUANTUM_2POW bytes.
+ */
#ifdef __i386__
-# define QUANTUM_2POW_MIN 4
+# define PAGESIZE_2POW 12
+# define QUANTUM_2POW 4
# define SIZEOF_PTR_2POW 2
# define CPU_SPINWAIT __asm__ volatile("pause")
#endif
#ifdef __ia64__
-# define QUANTUM_2POW_MIN 4
+# define PAGESIZE_2POW 12
+# define QUANTUM_2POW 4
# define SIZEOF_PTR_2POW 3
#endif
#ifdef __alpha__
-# define QUANTUM_2POW_MIN 4
+# define PAGESIZE_2POW 13
+# define QUANTUM_2POW 4
# define SIZEOF_PTR_2POW 3
# define NO_TLS
#endif
#ifdef __sparc64__
-# define QUANTUM_2POW_MIN 4
+# define PAGESIZE_2POW 13
+# define QUANTUM_2POW 4
# define SIZEOF_PTR_2POW 3
# define NO_TLS
#endif
#ifdef __amd64__
-# define QUANTUM_2POW_MIN 4
+# define PAGESIZE_2POW 12
+# define QUANTUM_2POW 4
# define SIZEOF_PTR_2POW 3
# define CPU_SPINWAIT __asm__ volatile("pause")
#endif
#ifdef __arm__
-# define QUANTUM_2POW_MIN 3
+# define PAGESIZE_2POW 12
+# define QUANTUM_2POW 3
# define SIZEOF_PTR_2POW 2
# define NO_TLS
#endif
#ifdef __mips__
-# define QUANTUM_2POW_MIN 3
+# define PAGESIZE_2POW 12
+# define QUANTUM_2POW 3
# define SIZEOF_PTR_2POW 2
# define NO_TLS
#endif
#ifdef __powerpc__
-# define QUANTUM_2POW_MIN 4
+# define PAGESIZE_2POW 12
+# define QUANTUM_2POW 4
# define SIZEOF_PTR_2POW 2
#endif
+#define QUANTUM ((size_t)(1U << QUANTUM_2POW))
+#define QUANTUM_MASK (QUANTUM - 1)
+
#define SIZEOF_PTR (1U << SIZEOF_PTR_2POW)
/* sizeof(int) == (1U << SIZEOF_INT_2POW). */
@@ -237,6 +282,10 @@
#endif
#ifdef NO_TLS
+ /* MALLOC_MAG requires TLS. */
+# ifdef MALLOC_MAG
+# undef MALLOC_MAG
+# endif
/* MALLOC_BALANCE requires TLS. */
# ifdef MALLOC_BALANCE
# undef MALLOC_BALANCE
@@ -253,23 +302,42 @@
#define DIRTY_MAX_DEFAULT (1U << 9)
/*
- * Maximum size of L1 cache line. This is used to avoid cache line aliasing,
- * so over-estimates are okay (up to a point), but under-estimates will
- * negatively affect performance.
+ * Maximum size of L1 cache line. This is used to avoid cache line aliasing.
+ * In addition, this controls the spacing of cacheline-spaced size classes.
*/
#define CACHELINE_2POW 6
#define CACHELINE ((size_t)(1U << CACHELINE_2POW))
+#define CACHELINE_MASK (CACHELINE - 1)
-/* Smallest size class to support. */
-#define TINY_MIN_2POW 1
+/*
+ * Subpages are an artificially designated partitioning of pages. Their only
+ * purpose is to support subpage-spaced size classes.
+ *
+ * There must be at least 4 subpages per page, due to the way size classes are
+ * handled.
+ */
+#define SUBPAGE_2POW 8
+#define SUBPAGE ((size_t)(1U << SUBPAGE_2POW))
+#define SUBPAGE_MASK (SUBPAGE - 1)
+
+#ifdef MALLOC_TINY
+ /* Smallest size class to support. */
+# define TINY_MIN_2POW 1
+#endif
/*
* Maximum size class that is a multiple of the quantum, but not (necessarily)
* a power of 2. Above this size, allocations are rounded up to the nearest
* power of 2.
*/
-#define SMALL_MAX_2POW_DEFAULT 9
-#define SMALL_MAX_DEFAULT (1U << SMALL_MAX_2POW_DEFAULT)
+#define QSPACE_MAX_2POW_DEFAULT 7
+
+/*
+ * Maximum size class that is a multiple of the cacheline, but not (necessarily)
+ * a power of 2. Above this size, allocations are rounded up to the nearest
+ * power of 2.
+ */
+#define CSPACE_MAX_2POW_DEFAULT 9
/*
* RUN_MAX_OVRHD indicates maximum desired run header overhead. Runs are sized
@@ -293,8 +361,7 @@
#define RUN_MAX_OVRHD_RELAX 0x00001800U
/* Put a cap on small object run size. This overrides RUN_MAX_OVRHD. */
-#define RUN_MAX_SMALL_2POW 15
-#define RUN_MAX_SMALL (1U << RUN_MAX_SMALL_2POW)
+#define RUN_MAX_SMALL (12 * pagesize)
/*
* Hyper-threaded CPUs may need a special instruction inside spin loops in
@@ -319,6 +386,15 @@
*/
#define BLOCK_COST_2POW 4
+#ifdef MALLOC_MAG
+ /*
+ * Default magazine size, in bytes. max_rounds is calculated to make
+ * optimal use of the space, leaving just enough room for the magazine
+ * header.
+ */
+# define MAG_SIZE_2POW_DEFAULT 9
+#endif
+
#ifdef MALLOC_BALANCE
/*
* We use an exponential moving average to track recent lock contention,
@@ -369,6 +445,11 @@
*/
uint64_t nrequests;
+#ifdef MALLOC_MAG
+ /* Number of magazine reloads from this bin. */
+ uint64_t nmags;
+#endif
+
/* Total number of runs created for this bin's size class. */
uint64_t nruns;
@@ -678,6 +759,35 @@
/******************************************************************************/
/*
+ * Magazine data structures.
+ */
+
+#ifdef MALLOC_MAG
+typedef struct mag_s mag_t;
+struct mag_s {
+ size_t binind; /* Index of associated bin. */
+ size_t nrounds;
+ void *rounds[1]; /* Dynamically sized. */
+};
+
+/*
+ * Magazines are lazily allocated, but once created, they remain until the
+ * associated mag_rack is destroyed.
+ */
+typedef struct bin_mags_s bin_mags_t;
+struct bin_mags_s {
+ mag_t *curmag;
+ mag_t *sparemag;
+};
+
+typedef struct mag_rack_s mag_rack_t;
+struct mag_rack_s {
+ bin_mags_t bin_mags[1]; /* Dynamically sized. */
+};
+#endif
+
+/******************************************************************************/
+/*
* Data.
*/
@@ -690,16 +800,147 @@
static size_t pagesize_2pow;
/* Various bin-related settings. */
-static size_t bin_maxclass; /* Max size class for bins. */
-static unsigned ntbins; /* Number of (2^n)-spaced tiny bins. */
+#ifdef MALLOC_TINY /* Number of (2^n)-spaced tiny bins. */
+# define ntbins ((unsigned)(QUANTUM_2POW - TINY_MIN_2POW))
+#else
+# define ntbins 0
+#endif
static unsigned nqbins; /* Number of quantum-spaced bins. */
-static unsigned nsbins; /* Number of (2^n)-spaced sub-page bins. */
-static size_t small_min;
-static size_t small_max;
+static unsigned ncbins; /* Number of cacheline-spaced bins. */
+static unsigned nsbins; /* Number of subpage-spaced bins. */
+static unsigned nbins;
+#ifdef MALLOC_TINY
+# define tspace_max ((size_t)(QUANTUM >> 1))
+#endif
+#define qspace_min QUANTUM
+static size_t qspace_max;
+static size_t cspace_min;
+static size_t cspace_max;
+static size_t sspace_min;
+static size_t sspace_max;
+#define bin_maxclass sspace_max
+
+static uint8_t const *size2bin;
+/*
+ * const_size2bin is a static constant lookup table that in the common case can
+ * be used as-is for size2bin. For dynamically linked programs, this avoids
+ * a page of memory overhead per process.
+ */
+#define S2B_1(i) i,
+#define S2B_2(i) S2B_1(i) S2B_1(i)
+#define S2B_4(i) S2B_2(i) S2B_2(i)
+#define S2B_8(i) S2B_4(i) S2B_4(i)
+#define S2B_16(i) S2B_8(i) S2B_8(i)
+#define S2B_32(i) S2B_16(i) S2B_16(i)
+#define S2B_64(i) S2B_32(i) S2B_32(i)
+#define S2B_128(i) S2B_64(i) S2B_64(i)
+#define S2B_256(i) S2B_128(i) S2B_128(i)
+static const uint8_t const_size2bin[(1U << PAGESIZE_2POW) - 255] = {
+ S2B_1(0xffU) /* 0 */
+#if (QUANTUM_2POW == 4)
+/* 64-bit system ************************/
+# ifdef MALLOC_TINY
+ S2B_2(0) /* 2 */
+ S2B_2(1) /* 4 */
+ S2B_4(2) /* 8 */
+ S2B_8(3) /* 16 */
+# define S2B_QMIN 3
+# else
+ S2B_16(0) /* 16 */
+# define S2B_QMIN 0
+# endif
+ S2B_16(S2B_QMIN + 1) /* 32 */
+ S2B_16(S2B_QMIN + 2) /* 48 */
+ S2B_16(S2B_QMIN + 3) /* 64 */
+ S2B_16(S2B_QMIN + 4) /* 80 */
+ S2B_16(S2B_QMIN + 5) /* 96 */
+ S2B_16(S2B_QMIN + 6) /* 112 */
+ S2B_16(S2B_QMIN + 7) /* 128 */
+# define S2B_CMIN (S2B_QMIN + 8)
+#else
+/* 32-bit system ************************/
+# ifdef MALLOC_TINY
+ S2B_2(0) /* 2 */
+ S2B_2(1) /* 4 */
+ S2B_4(2) /* 8 */
+# define S2B_QMIN 2
+# else
+ S2B_8(0) /* 8 */
+# define S2B_QMIN 0
+# endif
+ S2B_8(S2B_QMIN + 1) /* 16 */
+ S2B_8(S2B_QMIN + 2) /* 24 */
+ S2B_8(S2B_QMIN + 3) /* 32 */
+ S2B_8(S2B_QMIN + 4) /* 40 */
+ S2B_8(S2B_QMIN + 5) /* 48 */
+ S2B_8(S2B_QMIN + 6) /* 56 */
+ S2B_8(S2B_QMIN + 7) /* 64 */
+ S2B_8(S2B_QMIN + 8) /* 72 */
+ S2B_8(S2B_QMIN + 9) /* 80 */
+ S2B_8(S2B_QMIN + 10) /* 88 */
+ S2B_8(S2B_QMIN + 11) /* 96 */
+ S2B_8(S2B_QMIN + 12) /* 104 */
+ S2B_8(S2B_QMIN + 13) /* 112 */
+ S2B_8(S2B_QMIN + 14) /* 120 */
+ S2B_8(S2B_QMIN + 15) /* 128 */
+# define S2B_CMIN (S2B_QMIN + 16)
+#endif
+/****************************************/
+ S2B_64(S2B_CMIN + 0) /* 192 */
+ S2B_64(S2B_CMIN + 1) /* 256 */
+ S2B_64(S2B_CMIN + 2) /* 320 */
+ S2B_64(S2B_CMIN + 3) /* 384 */
+ S2B_64(S2B_CMIN + 4) /* 448 */
+ S2B_64(S2B_CMIN + 5) /* 512 */
+# define S2B_SMIN (S2B_CMIN + 6)
+ S2B_256(S2B_SMIN + 0) /* 768 */
+ S2B_256(S2B_SMIN + 1) /* 1024 */
+ S2B_256(S2B_SMIN + 2) /* 1280 */
+ S2B_256(S2B_SMIN + 3) /* 1536 */
+ S2B_256(S2B_SMIN + 4) /* 1792 */
+ S2B_256(S2B_SMIN + 5) /* 2048 */
+ S2B_256(S2B_SMIN + 6) /* 2304 */
+ S2B_256(S2B_SMIN + 7) /* 2560 */
+ S2B_256(S2B_SMIN + 8) /* 2816 */
+ S2B_256(S2B_SMIN + 9) /* 3072 */
+ S2B_256(S2B_SMIN + 10) /* 3328 */
+ S2B_256(S2B_SMIN + 11) /* 3584 */
+ S2B_256(S2B_SMIN + 12) /* 3840 */
+#if (PAGESIZE_2POW == 13)
+ S2B_256(S2B_SMIN + 13) /* 4096 */
+ S2B_256(S2B_SMIN + 14) /* 4352 */
+ S2B_256(S2B_SMIN + 15) /* 4608 */
+ S2B_256(S2B_SMIN + 16) /* 4864 */
+ S2B_256(S2B_SMIN + 17) /* 5120 */
+ S2B_256(S2B_SMIN + 18) /* 5376 */
+ S2B_256(S2B_SMIN + 19) /* 5632 */
+ S2B_256(S2B_SMIN + 20) /* 5888 */
+ S2B_256(S2B_SMIN + 21) /* 6144 */
+ S2B_256(S2B_SMIN + 22) /* 6400 */
+ S2B_256(S2B_SMIN + 23) /* 6656 */
+ S2B_256(S2B_SMIN + 24) /* 6912 */
+ S2B_256(S2B_SMIN + 25) /* 7168 */
+ S2B_256(S2B_SMIN + 26) /* 7424 */
+ S2B_256(S2B_SMIN + 27) /* 7680 */
+ S2B_256(S2B_SMIN + 28) /* 7936 */
+#endif
+};
+#undef S2B_1
+#undef S2B_2
+#undef S2B_4
+#undef S2B_8
+#undef S2B_16
+#undef S2B_32
+#undef S2B_64
+#undef S2B_128
+#undef S2B_256
+#undef S2B_QMIN
+#undef S2B_CMIN
+#undef S2B_SMIN
-/* Various quantum-related settings. */
-static size_t quantum;
-static size_t quantum_mask; /* (quantum - 1). */
+#ifdef MALLOC_MAG
+static size_t max_rounds;
+#endif
/* Various chunk-related settings. */
static size_t chunksize;
@@ -796,6 +1037,14 @@
static __thread arena_t *arenas_map;
#endif
+#ifdef MALLOC_MAG
+/*
+ * Map of thread-specific magazine racks, used for thread-specific object
+ * caching.
+ */
+static __thread mag_rack_t *mag_rack;
+#endif
+
#ifdef MALLOC_STATS
/* Chunk statistics. */
static chunk_stats_t stats_chunks;
@@ -818,13 +1067,17 @@
static bool opt_dss = true;
static bool opt_mmap = true;
#endif
+#ifdef MALLOC_MAG
+static bool opt_mag = true;
+static size_t opt_mag_size_2pow = MAG_SIZE_2POW_DEFAULT;
+#endif
static size_t opt_dirty_max = DIRTY_MAX_DEFAULT;
#ifdef MALLOC_BALANCE
static uint64_t opt_balance_threshold = BALANCE_THRESHOLD_DEFAULT;
#endif
static bool opt_print_stats = false;
-static size_t opt_quantum_2pow = QUANTUM_2POW_MIN;
-static size_t opt_small_max_2pow = SMALL_MAX_2POW_DEFAULT;
+static size_t opt_qspace_max_2pow = QSPACE_MAX_2POW_DEFAULT;
+static size_t opt_cspace_max_2pow = CSPACE_MAX_2POW_DEFAULT;
static size_t opt_chunk_2pow = CHUNK_2POW_DEFAULT;
static bool opt_utrace = false;
static bool opt_sysv = false;
@@ -902,15 +1155,21 @@
static void arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk,
arena_run_t *run, size_t oldsize, size_t newsize, bool dirty);
static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin);
-static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin);
-static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size);
+static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin);
+static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size);
#ifdef MALLOC_BALANCE
static void arena_lock_balance_hard(arena_t *arena);
#endif
+#ifdef MALLOC_MAG
+static void mag_load(mag_t *mag);
+#endif
static void *arena_malloc_large(arena_t *arena, size_t size, bool zero);
static void *arena_palloc(arena_t *arena, size_t alignment, size_t size,
size_t alloc_size);
static size_t arena_salloc(const void *ptr);
+#ifdef MALLOC_MAG
+static void mag_unload(mag_t *mag);
+#endif
static void arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk,
void *ptr);
static void arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk,
@@ -921,11 +1180,22 @@
static void *arena_ralloc(void *ptr, size_t size, size_t oldsize);
static bool arena_new(arena_t *arena);
static arena_t *arenas_extend(unsigned ind);
+#ifdef MALLOC_MAG
+static mag_t *mag_create(arena_t *arena, size_t binind);
+static void mag_destroy(mag_t *mag);
+static mag_rack_t *mag_rack_create(arena_t *arena);
>>> TRUNCATED FOR MAIL (1000 lines) <<<
More information about the p4-projects
mailing list