svn commit: r187681 - in head/sys: kern vm

Jeff Roberson jeff at FreeBSD.org
Sun Jan 25 01:11:25 PST 2009


Author: jeff
Date: Sun Jan 25 09:11:24 2009
New Revision: 187681
URL: http://svn.freebsd.org/changeset/base/187681

Log:
   - Make the keg abstraction more complete.  Permit a zone to have multiple
     backend kegs so it may source compatible memory from multiple backends.
     This is useful for cases such as NUMA or different layouts for the same
     memory type.
   - Provide a new API for adding new backend kegs to secondary zones.
   - Provide a new flag for adjusting the layout of zones to stagger
     allocations better across cache lines.
  
  Sponsored by:	Nokia
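
  [Editor's note] For context, a minimal consumer sketch of the new interface follows.
  "struct foo", the zone names and the idea of one master zone per memory backend are
  illustrative assumptions only, not part of this commit; the secondary zone is created
  against one master and the second master's keg is then attached with uma_zsecond_add():

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <vm/uma.h>

	/*
	 * Hypothetical consumer of the new interface.
	 */
	struct foo {
		uint64_t	f_id;
		char		f_data[120];
	};

	static uma_zone_t foo_master0, foo_master1, foo_zone;

	static void
	foo_zones_init(void)
	{

		/*
		 * Two masters with compatible flags; per the new comment in
		 * uma.h, UMA_ZONE_MALLOC type zones are the only supported
		 * extra backends at present.
		 */
		foo_master0 = uma_zcreate("foo backend0", sizeof(struct foo),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, UMA_ZONE_MALLOC);
		foo_master1 = uma_zcreate("foo backend1", sizeof(struct foo),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, UMA_ZONE_MALLOC);

		/* The secondary zone initially sources from foo_master0's keg. */
		foo_zone = uma_zsecond_create("foo", NULL, NULL, NULL, NULL,
		    foo_master0);

		/*
		 * New in this revision: attach the second master's keg so the
		 * zone can draw compatible memory from either backend.
		 */
		if (uma_zsecond_add(foo_zone, foo_master1) != 0)
			panic("foo_zones_init: uma_zsecond_add failed");
	}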

Modified:
  head/sys/kern/kern_malloc.c
  head/sys/vm/uma.h
  head/sys/vm/uma_core.c
  head/sys/vm/uma_dbg.c
  head/sys/vm/uma_int.h

Modified: head/sys/kern/kern_malloc.c
==============================================================================
--- head/sys/kern/kern_malloc.c	Sun Jan 25 08:27:11 2009	(r187680)
+++ head/sys/kern/kern_malloc.c	Sun Jan 25 09:11:24 2009	(r187681)
@@ -329,7 +329,6 @@ malloc(unsigned long size, struct malloc
 	int indx;
 	caddr_t va;
 	uma_zone_t zone;
-	uma_keg_t keg;
 #if defined(DIAGNOSTIC) || defined(DEBUG_REDZONE)
 	unsigned long osize = size;
 #endif
@@ -378,18 +377,16 @@ malloc(unsigned long size, struct malloc
 			size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
 		indx = kmemsize[size >> KMEM_ZSHIFT];
 		zone = kmemzones[indx].kz_zone;
-		keg = zone->uz_keg;
 #ifdef MALLOC_PROFILE
 		krequests[size >> KMEM_ZSHIFT]++;
 #endif
 		va = uma_zalloc(zone, flags);
 		if (va != NULL)
-			size = keg->uk_size;
+			size = zone->uz_size;
 		malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx);
 	} else {
 		size = roundup(size, PAGE_SIZE);
 		zone = NULL;
-		keg = NULL;
 		va = uma_large_malloc(size, flags);
 		malloc_type_allocated(mtp, va == NULL ? 0 : size);
 	}

Modified: head/sys/vm/uma.h
==============================================================================
--- head/sys/vm/uma.h	Sun Jan 25 08:27:11 2009	(r187680)
+++ head/sys/vm/uma.h	Sun Jan 25 09:11:24 2009	(r187681)
@@ -205,6 +205,17 @@ uma_zone_t uma_zsecond_create(char *name
 		    uma_init zinit, uma_fini zfini, uma_zone_t master);
 
 /*
+ * Add a second master to a secondary zone.  This provides multiple data
+ * backends for objects with the same size.  Both masters must have
+ * compatible allocation flags.  Presently, UMA_ZONE_MALLOC type zones are
+ * the only supported.
+ *
+ * Returns:
+ * 	Error on failure, 0 on success.
+ */
+int uma_zsecond_add(uma_zone_t zone, uma_zone_t master);
+
+/*
  * Definitions for uma_zcreate flags
  *
  * These flags share space with UMA_ZFLAGs in uma_int.h.  Be careful not to
@@ -230,6 +241,22 @@ uma_zone_t uma_zsecond_create(char *name
 #define	UMA_ZONE_SECONDARY	0x0200	/* Zone is a Secondary Zone */
 #define	UMA_ZONE_REFCNT		0x0400	/* Allocate refcnts in slabs */
 #define	UMA_ZONE_MAXBUCKET	0x0800	/* Use largest buckets */
+#define	UMA_ZONE_CACHESPREAD	0x1000	/*
+					 * Spread memory start locations across
+					 * all possible cache lines.  May
+					 * require many virtually contiguous
+					 * backend pages and can fail early.
+					 */
+#define	UMA_ZONE_VTOSLAB	0x2000	/* Zone uses vtoslab for lookup. */
+
+/*
+ * These flags are shared between the keg and zone.  In zones wishing to add
+ * new kegs these flags must be compatible.  Some are determined based on
+ * physical parameters of the request and may not be provided by the consumer.
+ */
+#define	UMA_ZONE_INHERIT						\
+    (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_HASH |		\
+    UMA_ZONE_REFCNT | UMA_ZONE_VTOSLAB)
 
 /* Definitions for align */
 #define UMA_ALIGN_PTR	(sizeof(void *) - 1)	/* Alignment fit for ptr */

Modified: head/sys/vm/uma_core.c
==============================================================================
--- head/sys/vm/uma_core.c	Sun Jan 25 08:27:11 2009	(r187680)
+++ head/sys/vm/uma_core.c	Sun Jan 25 09:11:24 2009	(r187681)
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson <jeff at FreeBSD.org>
+ * Copyright (c) 2002-2005, 2009 Jeffrey Roberson <jeff at FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic at FreeBSD.org>
  * Copyright (c) 2004-2006 Robert N. M. Watson
  * All rights reserved.
@@ -112,7 +112,7 @@ static uma_zone_t slabrefzone;	/* With r
 static uma_zone_t hashzone;
 
 /* The boot-time adjusted value for cache line alignment. */
-static int uma_align_cache = 16 - 1;
+static int uma_align_cache = 64 - 1;
 
 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
 
@@ -212,7 +212,7 @@ static void *obj_alloc(uma_zone_t, int, 
 static void *page_alloc(uma_zone_t, int, u_int8_t *, int);
 static void *startup_alloc(uma_zone_t, int, u_int8_t *, int);
 static void page_free(void *, int, u_int8_t);
-static uma_slab_t slab_zalloc(uma_zone_t, int);
+static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
 static void bucket_cache_drain(uma_zone_t zone);
@@ -221,8 +221,8 @@ static void keg_dtor(void *, int, void *
 static int zone_ctor(void *, int, void *, int);
 static void zone_dtor(void *, int, void *);
 static int zero_init(void *, int, int);
-static void zone_small_init(uma_zone_t zone);
-static void zone_large_init(uma_zone_t zone);
+static void keg_small_init(uma_keg_t keg);
+static void keg_large_init(uma_keg_t keg);
 static void zone_foreach(void (*zfunc)(uma_zone_t));
 static void zone_timeout(uma_zone_t zone);
 static int hash_alloc(struct uma_hash *);
@@ -230,19 +230,22 @@ static int hash_expand(struct uma_hash *
 static void hash_free(struct uma_hash *hash);
 static void uma_timeout(void *);
 static void uma_startup3(void);
-static void *uma_zalloc_internal(uma_zone_t, void *, int);
-static void uma_zfree_internal(uma_zone_t, void *, void *, enum zfreeskip,
+static void *zone_alloc_item(uma_zone_t, void *, int);
+static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip,
     int);
 static void bucket_enable(void);
 static void bucket_init(void);
 static uma_bucket_t bucket_alloc(int, int);
 static void bucket_free(uma_bucket_t);
 static void bucket_zone_drain(void);
-static int uma_zalloc_bucket(uma_zone_t zone, int flags);
-static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags);
-static void *uma_slab_alloc(uma_zone_t zone, uma_slab_t slab);
-static uma_zone_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
+static int zone_alloc_bucket(uma_zone_t zone, int flags);
+static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
+static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
+static void *slab_alloc_item(uma_zone_t zone, uma_slab_t slab);
+static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
     uma_fini fini, int align, u_int32_t flags);
+static inline void zone_relock(uma_zone_t zone, uma_keg_t keg);
+static inline void keg_relock(uma_keg_t keg, uma_zone_t zone);
 
 void uma_print_zone(uma_zone_t);
 void uma_print_stats(void);
@@ -291,7 +294,8 @@ bucket_init(void)
 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
 		size += sizeof(void *) * ubz->ubz_entries;
 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
-		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
+		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
+		    UMA_ZFLAG_INTERNAL | UMA_ZFLAG_BUCKET);
 		for (; i <= ubz->ubz_entries; i += (1 << BUCKET_SHIFT))
 			bucket_size[i >> BUCKET_SHIFT] = j;
 	}
@@ -326,7 +330,7 @@ bucket_alloc(int entries, int bflags)
 		return (NULL);
 
 	ubz = bucket_zone_lookup(entries);
-	bucket = uma_zalloc_internal(ubz->ubz_zone, NULL, bflags);
+	bucket = zone_alloc_item(ubz->ubz_zone, NULL, bflags);
 	if (bucket) {
 #ifdef INVARIANTS
 		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
@@ -344,7 +348,7 @@ bucket_free(uma_bucket_t bucket)
 	struct uma_bucket_zone *ubz;
 
 	ubz = bucket_zone_lookup(bucket->ub_entries);
-	uma_zfree_internal(ubz->ubz_zone, bucket, NULL, SKIP_NONE,
+	zone_free_item(ubz->ubz_zone, bucket, NULL, SKIP_NONE,
 	    ZFREE_STATFREE);
 }
 
@@ -357,6 +361,21 @@ bucket_zone_drain(void)
 		zone_drain(ubz->ubz_zone);
 }
 
+static inline uma_keg_t
+zone_first_keg(uma_zone_t zone)
+{
+
+	return (LIST_FIRST(&zone->uz_kegs)->kl_keg);
+}
+
+static void
+zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
+{
+	uma_klink_t klink;
+
+	LIST_FOREACH(klink, &zone->uz_kegs, kl_link)
+		kegfn(klink->kl_keg);
+}
 
 /*
  * Routine called by timeout which is used to fire off some time interval
@@ -382,29 +401,20 @@ uma_timeout(void *unused)
  * Routine to perform timeout driven calculations.  This expands the
  * hashes and does per cpu statistics aggregation.
  *
- *  Arguments:
- *	zone  The zone to operate on
- *
- *  Returns:
- *	Nothing
+ *  Returns nothing.
  */
 static void
-zone_timeout(uma_zone_t zone)
+keg_timeout(uma_keg_t keg)
 {
-	uma_keg_t keg;
-	u_int64_t alloc;
-
-	keg = zone->uz_keg;
-	alloc = 0;
 
+	KEG_LOCK(keg);
 	/*
-	 * Expand the zone hash table.
+	 * Expand the keg hash table.
 	 *
 	 * This is done if the number of slabs is larger than the hash size.
 	 * What I'm trying to do here is completely reduce collisions.  This
 	 * may be a little aggressive.  Should I allow for two collisions max?
 	 */
-	ZONE_LOCK(zone);
 	if (keg->uk_flags & UMA_ZONE_HASH &&
 	    keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
 		struct uma_hash newhash;
@@ -413,14 +423,14 @@ zone_timeout(uma_zone_t zone)
 
 		/*
 		 * This is so involved because allocating and freeing
-		 * while the zone lock is held will lead to deadlock.
+		 * while the keg lock is held will lead to deadlock.
 		 * I have to do everything in stages and check for
 		 * races.
 		 */
 		newhash = keg->uk_hash;
-		ZONE_UNLOCK(zone);
+		KEG_UNLOCK(keg);
 		ret = hash_alloc(&newhash);
-		ZONE_LOCK(zone);
+		KEG_LOCK(keg);
 		if (ret) {
 			if (hash_expand(&keg->uk_hash, &newhash)) {
 				oldhash = keg->uk_hash;
@@ -428,12 +438,19 @@ zone_timeout(uma_zone_t zone)
 			} else
 				oldhash = newhash;
 
-			ZONE_UNLOCK(zone);
+			KEG_UNLOCK(keg);
 			hash_free(&oldhash);
-			ZONE_LOCK(zone);
+			KEG_LOCK(keg);
 		}
 	}
-	ZONE_UNLOCK(zone);
+	KEG_UNLOCK(keg);
+}
+
+static void
+zone_timeout(uma_zone_t zone)
+{
+
+	zone_foreach_keg(zone, &keg_timeout);
 }
 
 /*
@@ -462,7 +479,7 @@ hash_alloc(struct uma_hash *hash)
 		    M_UMAHASH, M_NOWAIT);
 	} else {
 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
-		hash->uh_slab_hash = uma_zalloc_internal(hashzone, NULL,
+		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
 		    M_WAITOK);
 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 	}
@@ -535,7 +552,7 @@ hash_free(struct uma_hash *hash)
 	if (hash->uh_slab_hash == NULL)
 		return;
 	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
-		uma_zfree_internal(hashzone,
+		zone_free_item(hashzone,
 		    hash->uh_slab_hash, NULL, SKIP_NONE, ZFREE_STATFREE);
 	else
 		free(hash->uh_slab_hash, M_UMAHASH);
@@ -555,20 +572,11 @@ hash_free(struct uma_hash *hash)
 static void
 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 {
-	uma_slab_t slab;
-	int mzone;
 	void *item;
 
 	if (bucket == NULL)
 		return;
 
-	slab = NULL;
-	mzone = 0;
-
-	/* We have to lookup the slab again for malloc.. */
-	if (zone->uz_keg->uk_flags & UMA_ZONE_MALLOC)
-		mzone = 1;
-
 	while (bucket->ub_cnt > 0)  {
 		bucket->ub_cnt--;
 		item = bucket->ub_bucket[bucket->ub_cnt];
@@ -577,15 +585,7 @@ bucket_drain(uma_zone_t zone, uma_bucket
 		KASSERT(item != NULL,
 		    ("bucket_drain: botched ptr, item is NULL"));
 #endif
-		/*
-		 * This is extremely inefficient.  The slab pointer was passed
-		 * to uma_zfree_arg, but we lost it because the buckets don't
-		 * hold them.  This will go away when free() gets a size passed
-		 * to it.
-		 */
-		if (mzone)
-			slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK));
-		uma_zfree_internal(zone, item, slab, SKIP_DTOR, 0);
+		zone_free_item(zone, item, NULL, SKIP_DTOR, 0);
 	}
 }
 
@@ -665,42 +665,32 @@ bucket_cache_drain(uma_zone_t zone)
 }
 
 /*
- * Frees pages from a zone back to the system.  This is done on demand from
+ * Frees pages from a keg back to the system.  This is done on demand from
  * the pageout daemon.
  *
- * Arguments:
- *	zone  The zone to free pages from
- *	 all  Should we drain all items?
- *
- * Returns:
- *	Nothing.
+ * Returns nothing.
  */
-void
-zone_drain(uma_zone_t zone)
+static void
+keg_drain(uma_keg_t keg)
 {
 	struct slabhead freeslabs = { 0 };
-	uma_keg_t keg;
 	uma_slab_t slab;
 	uma_slab_t n;
 	u_int8_t flags;
 	u_int8_t *mem;
 	int i;
 
-	keg = zone->uz_keg;
-
 	/*
-	 * We don't want to take pages from statically allocated zones at this
+	 * We don't want to take pages from statically allocated kegs at this
 	 * time
 	 */
 	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
 		return;
 
-	ZONE_LOCK(zone);
-
 #ifdef UMA_DEBUG
-	printf("%s free items: %u\n", zone->uz_name, keg->uk_free);
+	printf("%s free items: %u\n", keg->uk_name, keg->uk_free);
 #endif
-	bucket_cache_drain(zone);
+	KEG_LOCK(keg);
 	if (keg->uk_free == 0)
 		goto finished;
 
@@ -726,7 +716,7 @@ zone_drain(uma_zone_t zone)
 		slab = n;
 	}
 finished:
-	ZONE_UNLOCK(zone);
+	KEG_UNLOCK(keg);
 
 	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
 		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
@@ -738,8 +728,7 @@ finished:
 		flags = slab->us_flags;
 		mem = slab->us_data;
 
-		if ((keg->uk_flags & UMA_ZONE_MALLOC) ||
-		    (keg->uk_flags & UMA_ZONE_REFCNT)) {
+		if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
 			vm_object_t obj;
 
 			if (flags & UMA_SLAB_KMEM)
@@ -753,21 +742,61 @@ finished:
 				    obj);
 		}
 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
-			uma_zfree_internal(keg->uk_slabzone, slab, NULL,
+			zone_free_item(keg->uk_slabzone, slab, NULL,
 			    SKIP_NONE, ZFREE_STATFREE);
 #ifdef UMA_DEBUG
 		printf("%s: Returning %d bytes.\n",
-		    zone->uz_name, UMA_SLAB_SIZE * keg->uk_ppera);
+		    keg->uk_name, UMA_SLAB_SIZE * keg->uk_ppera);
 #endif
 		keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags);
 	}
 }
 
+static void
+zone_drain_wait(uma_zone_t zone, int waitok)
+{
+
+	/*
+	 * Set draining to interlock with zone_dtor() so we can release our
+	 * locks as we go.  Only dtor() should do a WAITOK call since it
+	 * is the only call that knows the structure will still be available
+	 * when it wakes up.
+	 */
+	ZONE_LOCK(zone);
+	while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
+		if (waitok == M_NOWAIT)
+			goto out;
+		mtx_unlock(&uma_mtx);
+		msleep(zone, zone->uz_lock, PVM, "zonedrain", 1);
+		mtx_lock(&uma_mtx);
+	}
+	zone->uz_flags |= UMA_ZFLAG_DRAINING;
+	bucket_cache_drain(zone);
+	ZONE_UNLOCK(zone);
+	/*
+	 * The DRAINING flag protects us from being freed while
+	 * we're running.  Normally the uma_mtx would protect us but we
+	 * must be able to release and acquire the right lock for each keg.
+	 */
+	zone_foreach_keg(zone, &keg_drain);
+	ZONE_LOCK(zone);
+	zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
+	wakeup(zone);
+out:
+	ZONE_UNLOCK(zone);
+}
+
+void
+zone_drain(uma_zone_t zone)
+{
+
+	zone_drain_wait(zone, M_NOWAIT);
+}
+
 /*
- * Allocate a new slab for a zone.  This does not insert the slab onto a list.
+ * Allocate a new slab for a keg.  This does not insert the slab onto a list.
  *
  * Arguments:
- *	zone  The zone to allocate slabs for
  *	wait  Shall we wait?
  *
  * Returns:
@@ -775,27 +804,28 @@ finished:
  *	caller specified M_NOWAIT.
  */
 static uma_slab_t
-slab_zalloc(uma_zone_t zone, int wait)
+keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
 {
 	uma_slabrefcnt_t slabref;
+	uma_alloc allocf;
 	uma_slab_t slab;
-	uma_keg_t keg;
 	u_int8_t *mem;
 	u_int8_t flags;
 	int i;
 
+	mtx_assert(&keg->uk_lock, MA_OWNED);
 	slab = NULL;
-	keg = zone->uz_keg;
 
 #ifdef UMA_DEBUG
-	printf("slab_zalloc:  Allocating a new slab for %s\n", zone->uz_name);
+	printf("slab_zalloc:  Allocating a new slab for %s\n", keg->uk_name);
 #endif
-	ZONE_UNLOCK(zone);
+	allocf = keg->uk_allocf;
+	KEG_UNLOCK(keg);
 
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
-		slab = uma_zalloc_internal(keg->uk_slabzone, NULL, wait);
+		slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
 		if (slab == NULL) {
-			ZONE_LOCK(zone);
+			KEG_LOCK(keg);
 			return NULL;
 		}
 	}
@@ -812,13 +842,13 @@ slab_zalloc(uma_zone_t zone, int wait)
 	else
 		wait &= ~M_ZERO;
 
-	mem = keg->uk_allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE,
-	    &flags, wait);
+	/* zone is passed for legacy reasons. */
+	mem = allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE, &flags, wait);
 	if (mem == NULL) {
 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
-			uma_zfree_internal(keg->uk_slabzone, slab, NULL,
+			zone_free_item(keg->uk_slabzone, slab, NULL,
 			    SKIP_NONE, ZFREE_STATFREE);
-		ZONE_LOCK(zone);
+		KEG_LOCK(keg);
 		return (NULL);
 	}
 
@@ -826,8 +856,7 @@ slab_zalloc(uma_zone_t zone, int wait)
 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
 		slab = (uma_slab_t )(mem + keg->uk_pgoff);
 
-	if ((keg->uk_flags & UMA_ZONE_MALLOC) ||
-	    (keg->uk_flags & UMA_ZONE_REFCNT))
+	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
 		for (i = 0; i < keg->uk_ppera; i++)
 			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
 
@@ -860,8 +889,7 @@ slab_zalloc(uma_zone_t zone, int wait)
 					    (keg->uk_rsize * i),
 					    keg->uk_size);
 			}
-			if ((keg->uk_flags & UMA_ZONE_MALLOC) ||
-			    (keg->uk_flags & UMA_ZONE_REFCNT)) {
+			if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
 				vm_object_t obj;
 
 				if (flags & UMA_SLAB_KMEM)
@@ -875,15 +903,15 @@ slab_zalloc(uma_zone_t zone, int wait)
 					    (i * PAGE_SIZE), obj);
 			}
 			if (keg->uk_flags & UMA_ZONE_OFFPAGE)
-				uma_zfree_internal(keg->uk_slabzone, slab,
+				zone_free_item(keg->uk_slabzone, slab,
 				    NULL, SKIP_NONE, ZFREE_STATFREE);
 			keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera,
 			    flags);
-			ZONE_LOCK(zone);
+			KEG_LOCK(keg);
 			return (NULL);
 		}
 	}
-	ZONE_LOCK(zone);
+	KEG_LOCK(keg);
 
 	if (keg->uk_flags & UMA_ZONE_HASH)
 		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
@@ -905,7 +933,7 @@ startup_alloc(uma_zone_t zone, int bytes
 	uma_keg_t keg;
 	uma_slab_t tmps;
 
-	keg = zone->uz_keg;
+	keg = zone_first_keg(zone);
 
 	/*
 	 * Check our small startup cache to see if it has pages remaining.
@@ -935,7 +963,6 @@ startup_alloc(uma_zone_t zone, int bytes
  * Allocates a number of pages from the system
  *
  * Arguments:
- *	zone  Unused
  *	bytes  The number of bytes requested
  *	wait  Shall we wait?
  *
@@ -958,7 +985,6 @@ page_alloc(uma_zone_t zone, int bytes, u
  * Allocates a number of pages from within an object
  *
  * Arguments:
- *	zone   Unused
  *	bytes  The number of bytes requested
  *	wait   Shall we wait?
  *
@@ -973,8 +999,10 @@ obj_alloc(uma_zone_t zone, int bytes, u_
 	vm_offset_t retkva, zkva;
 	vm_page_t p;
 	int pages, startpages;
+	uma_keg_t keg;
 
-	object = zone->uz_keg->uk_obj;
+	keg = zone_first_keg(zone);
+	object = keg->uk_obj;
 	retkva = 0;
 
 	/*
@@ -984,7 +1012,7 @@ obj_alloc(uma_zone_t zone, int bytes, u_
 	p = TAILQ_LAST(&object->memq, pglist);
 	pages = p != NULL ? p->pindex + 1 : 0;
 	startpages = pages;
-	zkva = zone->uz_keg->uk_kva + pages * PAGE_SIZE;
+	zkva = keg->uk_kva + pages * PAGE_SIZE;
 	for (; bytes > 0; bytes -= PAGE_SIZE) {
 		p = vm_page_alloc(object, pages,
 		    VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
@@ -1052,25 +1080,23 @@ zero_init(void *mem, int size, int flags
 }
 
 /*
- * Finish creating a small uma zone.  This calculates ipers, and the zone size.
+ * Finish creating a small uma keg.  This calculates ipers, and the keg size.
  *
  * Arguments
- *	zone  The zone we should initialize
+ *	keg  The zone we should initialize
  *
  * Returns
  *	Nothing
  */
 static void
-zone_small_init(uma_zone_t zone)
+keg_small_init(uma_keg_t keg)
 {
-	uma_keg_t keg;
 	u_int rsize;
 	u_int memused;
 	u_int wastedspace;
 	u_int shsize;
 
-	keg = zone->uz_keg;
-	KASSERT(keg != NULL, ("Keg is null in zone_small_init"));
+	KASSERT(keg != NULL, ("Keg is null in keg_small_init"));
 	rsize = keg->uk_size;
 
 	if (rsize < UMA_SMALLEST_UNIT)
@@ -1090,7 +1116,7 @@ zone_small_init(uma_zone_t zone)
 	}
 
 	keg->uk_ipers = (UMA_SLAB_SIZE - shsize) / rsize;
-	KASSERT(keg->uk_ipers != 0, ("zone_small_init: ipers is 0"));
+	KASSERT(keg->uk_ipers != 0, ("keg_small_init: ipers is 0"));
 	memused = keg->uk_ipers * rsize + shsize;
 	wastedspace = UMA_SLAB_SIZE - memused;
 
@@ -1109,44 +1135,41 @@ zone_small_init(uma_zone_t zone)
 	    (keg->uk_ipers < (UMA_SLAB_SIZE / keg->uk_rsize))) {
 		keg->uk_ipers = UMA_SLAB_SIZE / keg->uk_rsize;
 		KASSERT(keg->uk_ipers <= 255,
-		    ("zone_small_init: keg->uk_ipers too high!"));
+		    ("keg_small_init: keg->uk_ipers too high!"));
 #ifdef UMA_DEBUG
 		printf("UMA decided we need offpage slab headers for "
-		    "zone: %s, calculated wastedspace = %d, "
+		    "keg: %s, calculated wastedspace = %d, "
 		    "maximum wasted space allowed = %d, "
 		    "calculated ipers = %d, "
-		    "new wasted space = %d\n", zone->uz_name, wastedspace,
+		    "new wasted space = %d\n", keg->uk_name, wastedspace,
 		    UMA_MAX_WASTE, keg->uk_ipers,
 		    UMA_SLAB_SIZE - keg->uk_ipers * keg->uk_rsize);
 #endif
 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
-		if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
+		if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
 			keg->uk_flags |= UMA_ZONE_HASH;
 	}
 }
 
 /*
- * Finish creating a large (> UMA_SLAB_SIZE) uma zone.  Just give in and do
+ * Finish creating a large (> UMA_SLAB_SIZE) uma kegs.  Just give in and do
  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
  * more complicated.
  *
  * Arguments
- *	zone  The zone we should initialize
+ *	keg  The keg we should initialize
  *
  * Returns
  *	Nothing
  */
 static void
-zone_large_init(uma_zone_t zone)
+keg_large_init(uma_keg_t keg)
 {
-	uma_keg_t keg;
 	int pages;
 
-	keg = zone->uz_keg;
-
-	KASSERT(keg != NULL, ("Keg is null in zone_large_init"));
+	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
 	KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
-	    ("zone_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY zone"));
+	    ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
 
 	pages = keg->uk_size / UMA_SLAB_SIZE;
 
@@ -1158,12 +1181,44 @@ zone_large_init(uma_zone_t zone)
 	keg->uk_ipers = 1;
 
 	keg->uk_flags |= UMA_ZONE_OFFPAGE;
-	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
+	if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
 		keg->uk_flags |= UMA_ZONE_HASH;
 
 	keg->uk_rsize = keg->uk_size;
 }
 
+static void
+keg_cachespread_init(uma_keg_t keg)
+{
+	int alignsize;
+	int trailer;
+	int pages;
+	int rsize;
+
+	alignsize = keg->uk_align + 1;
+	rsize = keg->uk_size;
+	/*
+	 * We want one item to start on every align boundary in a page.  To
+	 * do this we will span pages.  We will also extend the item by the
+	 * size of align if it is an even multiple of align.  Otherwise, it
+	 * would fall on the same boundary every time.
+	 */
+	if (rsize & keg->uk_align)
+		rsize = (rsize & ~keg->uk_align) + alignsize;
+	if ((rsize & alignsize) == 0)
+		rsize += alignsize;
+	trailer = rsize - keg->uk_size;
+	pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
+	pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
+	keg->uk_rsize = rsize;
+	keg->uk_ppera = pages;
+	keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
+	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
+	KASSERT(keg->uk_ipers <= uma_max_ipers,
+	    ("keg_small_init: keg->uk_ipers too high(%d) increase max_ipers",
+	    keg->uk_ipers));
+}
+
 /*
  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
  * the keg onto the global keg list.
@@ -1195,7 +1250,7 @@ keg_ctor(void *mem, int size, void *udat
 	 * The master zone is passed to us at keg-creation time.
 	 */
 	zone = arg->zone;
-	zone->uz_keg = keg;
+	keg->uk_name = zone->uz_name;
 
 	if (arg->flags & UMA_ZONE_VM)
 		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
@@ -1203,24 +1258,31 @@ keg_ctor(void *mem, int size, void *udat
 	if (arg->flags & UMA_ZONE_ZINIT)
 		keg->uk_init = zero_init;
 
+	if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC)
+		keg->uk_flags |= UMA_ZONE_VTOSLAB;
+
 	/*
 	 * The +UMA_FRITM_SZ added to uk_size is to account for the
-	 * linkage that is added to the size in zone_small_init().  If
+	 * linkage that is added to the size in keg_small_init().  If
 	 * we don't account for this here then we may end up in
-	 * zone_small_init() with a calculated 'ipers' of 0.
+	 * keg_small_init() with a calculated 'ipers' of 0.
 	 */
 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
-		if ((keg->uk_size+UMA_FRITMREF_SZ) >
+		if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
+			keg_cachespread_init(keg);
+		else if ((keg->uk_size+UMA_FRITMREF_SZ) >
 		    (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)))
-			zone_large_init(zone);
+			keg_large_init(keg);
 		else
-			zone_small_init(zone);
+			keg_small_init(keg);
 	} else {
-		if ((keg->uk_size+UMA_FRITM_SZ) >
+		if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
+			keg_cachespread_init(keg);
+		else if ((keg->uk_size+UMA_FRITM_SZ) >
 		    (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
-			zone_large_init(zone);
+			keg_large_init(keg);
 		else
-			zone_small_init(zone);
+			keg_small_init(keg);
 	}
 
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
@@ -1244,14 +1306,12 @@ keg_ctor(void *mem, int size, void *udat
 	}
 
 	/*
-	 * Initialize keg's lock (shared among zones) through
-	 * Master zone
+	 * Initialize keg's lock (shared among zones).
 	 */
-	zone->uz_lock = &keg->uk_lock;
 	if (arg->flags & UMA_ZONE_MTXCLASS)
-		ZONE_LOCK_INIT(zone, 1);
+		KEG_LOCK_INIT(keg, 1);
 	else
-		ZONE_LOCK_INIT(zone, 0);
+		KEG_LOCK_INIT(keg, 0);
 
 	/*
 	 * If we're putting the slab header in the actual page we need to
@@ -1300,10 +1360,10 @@ keg_ctor(void *mem, int size, void *udat
 		hash_alloc(&keg->uk_hash);
 
 #ifdef UMA_DEBUG
-	printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n",
-	    zone->uz_name, zone,
-	    keg->uk_size, keg->uk_ipers,
-	    keg->uk_ppera, keg->uk_pgoff);
+	printf("UMA: %s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n",
+	    zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
+	    keg->uk_ipers, keg->uk_ppera,
+	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
 #endif
 
 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
@@ -1320,7 +1380,6 @@ keg_ctor(void *mem, int size, void *udat
  * Arguments/Returns follow uma_ctor specifications
  *	udata  Actually uma_zctor_args
  */
-
 static int
 zone_ctor(void *mem, int size, void *udata, int flags)
 {
@@ -1333,23 +1392,24 @@ zone_ctor(void *mem, int size, void *uda
 	zone->uz_name = arg->name;
 	zone->uz_ctor = arg->ctor;
 	zone->uz_dtor = arg->dtor;
+	zone->uz_slab = zone_fetch_slab;
 	zone->uz_init = NULL;
 	zone->uz_fini = NULL;
 	zone->uz_allocs = 0;
 	zone->uz_frees = 0;
 	zone->uz_fails = 0;
 	zone->uz_fills = zone->uz_count = 0;
+	zone->uz_flags = 0;
+	keg = arg->keg;
 
 	if (arg->flags & UMA_ZONE_SECONDARY) {
 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
-		keg = arg->keg;
-		zone->uz_keg = keg;
 		zone->uz_init = arg->uminit;
 		zone->uz_fini = arg->fini;
 		zone->uz_lock = &keg->uk_lock;
+		zone->uz_flags |= UMA_ZONE_SECONDARY;
 		mtx_lock(&uma_mtx);
 		ZONE_LOCK(zone);
-		keg->uk_flags |= UMA_ZONE_SECONDARY;
 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
 			if (LIST_NEXT(z, uz_link) == NULL) {
 				LIST_INSERT_AFTER(z, zone, uz_link);
@@ -1358,9 +1418,9 @@ zone_ctor(void *mem, int size, void *uda
 		}
 		ZONE_UNLOCK(zone);
 		mtx_unlock(&uma_mtx);
-	} else if (arg->keg == NULL) {
-		if (uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
-		    arg->align, arg->flags) == NULL)
+	} else if (keg == NULL) {
+		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
+		    arg->align, arg->flags)) == NULL)
 			return (ENOMEM);
 	} else {
 		struct uma_kctor_args karg;
@@ -1378,15 +1438,22 @@ zone_ctor(void *mem, int size, void *uda
 		if (error)
 			return (error);
 	}
-	keg = zone->uz_keg;
+	/*
+	 * Link in the first keg.
+	 */
+	zone->uz_klink.kl_keg = keg;
+	LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
 	zone->uz_lock = &keg->uk_lock;
+	zone->uz_size = keg->uk_size;
+	zone->uz_flags |= (keg->uk_flags &
+	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
 
 	/*
 	 * Some internal zones don't have room allocated for the per cpu
 	 * caches.  If we're internal, bail out here.
 	 */
 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
-		KASSERT((keg->uk_flags & UMA_ZONE_SECONDARY) == 0,
+		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
 		return (0);
 	}
@@ -1413,18 +1480,17 @@ keg_dtor(void *arg, int size, void *udat
 	uma_keg_t keg;
 
 	keg = (uma_keg_t)arg;
-	mtx_lock(&keg->uk_lock);
+	KEG_LOCK(keg);
 	if (keg->uk_free != 0) {
 		printf("Freed UMA keg was not empty (%d items). "
 		    " Lost %d pages of memory.\n",
 		    keg->uk_free, keg->uk_pages);
 	}
-	mtx_unlock(&keg->uk_lock);
+	KEG_UNLOCK(keg);
 
-	if (keg->uk_flags & UMA_ZONE_HASH)
-		hash_free(&keg->uk_hash);
+	hash_free(&keg->uk_hash);
 
-	mtx_destroy(&keg->uk_lock);
+	KEG_LOCK_FINI(keg);
 }
 
 /*
@@ -1436,38 +1502,46 @@ keg_dtor(void *arg, int size, void *udat
 static void
 zone_dtor(void *arg, int size, void *udata)
 {
+	uma_klink_t klink;
 	uma_zone_t zone;
 	uma_keg_t keg;
 
 	zone = (uma_zone_t)arg;
-	keg = zone->uz_keg;
+	keg = zone_first_keg(zone);
 
-	if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL))
+	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
 		cache_drain(zone);
 
 	mtx_lock(&uma_mtx);
-	zone_drain(zone);
-	if (keg->uk_flags & UMA_ZONE_SECONDARY) {
-		LIST_REMOVE(zone, uz_link);
-		/*
-		 * XXX there are some races here where
-		 * the zone can be drained but zone lock
-		 * released and then refilled before we
-		 * remove it... we dont care for now
-		 */
-		ZONE_LOCK(zone);
-		if (LIST_EMPTY(&keg->uk_zones))
-			keg->uk_flags &= ~UMA_ZONE_SECONDARY;
-		ZONE_UNLOCK(zone);
-		mtx_unlock(&uma_mtx);
-	} else {
+	LIST_REMOVE(zone, uz_link);
+	mtx_unlock(&uma_mtx);
+	/*
+	 * XXX there are some races here where
+	 * the zone can be drained but zone lock
+	 * released and then refilled before we
+	 * remove it... we dont care for now
+	 */
+	zone_drain_wait(zone, M_WAITOK);
+	/*
+	 * Unlink all of our kegs.
+	 */
+	while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
+		klink->kl_keg = NULL;
+		LIST_REMOVE(klink, kl_link);
+		if (klink == &zone->uz_klink)
+			continue;
+		free(klink, M_TEMP);
+	}
+	/*
+	 * We only destroy kegs from non secondary zones.
+	 */
+	if ((zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
+		mtx_lock(&uma_mtx);
 		LIST_REMOVE(keg, uk_link);
-		LIST_REMOVE(zone, uz_link);
 		mtx_unlock(&uma_mtx);
-		uma_zfree_internal(kegs, keg, NULL, SKIP_NONE,
+		zone_free_item(kegs, keg, NULL, SKIP_NONE,
 		    ZFREE_STATFREE);
 	}
-	zone->uz_keg = NULL;
 }
 
 /*
@@ -1517,7 +1591,7 @@ uma_startup(void *bootmem, int boot_page
 	 * (UMA_MAX_WASTE).
 	 *
 	 * We iterate until we find an object size for
-	 * which the calculated wastage in zone_small_init() will be
+	 * which the calculated wastage in keg_small_init() will be
 	 * enough to warrant OFFPAGE.  Since wastedspace versus objsize
 	 * is an overall increasing see-saw function, we find the smallest
 	 * objsize such that the wastage is always acceptable for objects
@@ -1525,7 +1599,7 @@ uma_startup(void *bootmem, int boot_page
 	 * generates a larger possible uma_max_ipers, we use this computed
 	 * objsize to calculate the largest ipers possible.  Since the
 	 * ipers calculated for OFFPAGE slab headers is always larger than
-	 * the ipers initially calculated in zone_small_init(), we use
+	 * the ipers initially calculated in keg_small_init(), we use
 	 * the former's equation (UMA_SLAB_SIZE / keg->uk_rsize) to
 	 * obtain the maximum ipers possible for offpage slab headers.
 	 *
@@ -1557,7 +1631,7 @@ uma_startup(void *bootmem, int boot_page
 	}

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
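
[Editor's note] As a worked illustration of the new UMA_ZONE_CACHESPREAD layout (see
keg_cachespread_init() in the uma_core.c hunk above), the following userland sketch
reproduces the sizing arithmetic for one assumed configuration: 4 KB pages, 64-byte
cache lines and a 128-byte item.  The numbers are illustrative only; in the kernel they
are derived from uk_align and uk_size at keg creation time.

	#include <stdio.h>

	#define	PAGE_SIZE	4096
	#define	MIN(a, b)	((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		int align = 64 - 1;	/* uk_align for 64-byte cache lines */
		int size = 128;		/* uk_size of the item */
		int alignsize, rsize, trailer, pages, ipers;

		alignsize = align + 1;
		rsize = size;
		/*
		 * Round the item up to the alignment, then pad it by one
		 * extra cache line if the result is an even multiple of the
		 * line size, so that consecutive items start on different
		 * cache-line offsets.
		 */
		if (rsize & align)
			rsize = (rsize & ~align) + alignsize;
		if ((rsize & alignsize) == 0)
			rsize += alignsize;
		trailer = rsize - size;
		pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
		pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
		ipers = ((pages * PAGE_SIZE) + trailer) / rsize;

		/* Expect: rsize 192, ppera 3, ipers 64. */
		printf("rsize %d ppera %d ipers %d\n", rsize, pages, ipers);
		return (0);
	}

With these inputs the keg uses a 3-page slab of 64 items spaced 192 bytes apart, so
consecutive item start addresses cycle through all 64 possible cache-line offsets within
a page before repeating, which is the staggering the commit log describes.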

