git: aa6b871ea77e - main - arm64: Add support to vchiq and bcm2835_audio (plus some fixes)

From: Adrian Chadd <adrian_at_FreeBSD.org>
Date: Sat, 15 Nov 2025 03:28:08 UTC
The branch main has been updated by adrian:

URL: https://cgit.FreeBSD.org/src/commit/?id=aa6b871ea77e5b52cf4683c5f304a82d2e351ba0

commit aa6b871ea77e5b52cf4683c5f304a82d2e351ba0
Author:     Marco Devesas Campos <devesas.campos@gmail.com>
AuthorDate: 2025-10-20 02:50:19 +0000
Commit:     Adrian Chadd <adrian@FreeBSD.org>
CommitDate: 2025-11-15 03:27:46 +0000

    arm64: Add support to vchiq and bcm2835_audio (plus some fixes)
    
    Add 64 bit support to vchiq:
    
     * update fields to the appropriate fixed bit-size variants (everywhere [cf. e.g., ref:sizes and ref:sizes2])
     * refer to event semaphores (that go into the very 32 bit VC) by offset instead of pointers [ref:sems]
     * dsb() is dsb(sy) in arm64 (vchiq_{core.c,core.h,kmod.c}) [ref:dsb]
     * comment out some unneeded code in parse_rx_slots around VCHIQ_MSG_BULK_RX (cf. [ref:deadcode])
     * adapt remote_event_signal to arm64 caching behaviours (vchiq_kmod.c)
     * refactor synchronization around remote_event_signal, forcing a wmb to be on the safe side; thereby make it look more like what linux does [ref:sync] (vchiq_{core,kmod}.c); and make a comment in vchiq_core.c true (wasn't before)
     * add a few more syncs to be on the safe side (vchiq_2835_arm.c)
     * use arm64 dcache invalidation mechanisms (vchiq_2835_arm.c)
     * explicitly invalidate pages on arm64 post bulk-read (vchiq_2835_arm.c)
     * support bulk transfers on rpi-4 (aka "long address space" transfers), by hard-coding their vc offset (0) and different bit-shift [ref:longbulk] (vchiq_2835_arm.c)
     * refactor a loop-of-constant-test (vchiq_2835_arm.c)
     * use the correct (hard-coded) cache-line size on arm64
     * rework the handling of chipset "features" to account for the extra behaviours with 64 bit chipsets. (vchiq_kmod.c)
     * add sysctl-s (log, arm_log) to control debug (vchiq_kmod.c)
     * add example kernel config (GENERIC-VCHIQ)
    
    Fixes:
    
     * Rework error handling in create_pagelist, avoiding a potential panic when
       freeing memory that had been allocated with bus_dmamem_alloc, a potential
       null dereference, and a leak when having problems pinning pages (vchiq_2835_arm.c)
     * fix a confusion about the behaviour of cv_wait_sig that led to
       uninterruptible looping (vchi_bsd.c)
     * implement detection of fatal signals (vchi_bsd.c)
     * fix a confusion with the name of a variable introduced by #a0b8746
       that could lead to a panic when closing the cdev file (vchiq_arm.c)
     * release user connection when destructing cdevpriv and avoid user processes
       sharing connection data, which led to stalls and data corruption. (vchiq_arm.c)
    
    Update bcm2835_audio to work on 64bit systems:
    
     * update VC audio fields (vc_vchi_audioserv_defs.h, bcm2835_audio.c)
     * repurpose the hitherto unused callback field to help push a 64 bit pointer in (bcm2835_audio.c)
     * increase (hopefully) the robustness of the code that shifts data to VC (bcm2835_audio.c)
     * add a sysctl to control the amount of debugging info output by bcm2835_audio.c
    
    Tested on zero, zero2 and 4+ with ping, functional, bulk and control vchiq_test-s, and omxplayer
    
      [ref:dsb]: https://github.com/raspberrypi/linux/commit/35b7ebda57affcfd3616d39d5a727a4495b31123
      [ref:sems]: https://github.com/raspberrypi/linux/commit/24a4262afb10907fce3cdbc3ae336fcf4cdaece5
      [ref:sizes]: https://github.com/raspberrypi/linux/commit/e64568b8ea6c04e747e432c17ce2452652075216
      [ref:sizes2]: https://github.com/raspberrypi/linux/commit/f9bee6dd24addfa00c2c8d50c25b73efbfbb28ba
      [ref:deadcode]: https://github.com/raspberrypi/linux/commit/14f4d72fb799a9b3170a45ab80d4a3ddad541960
      [ref:sync]: https://github.com/raspberrypi/linux/commit/51c071265079319583e4c6e8c61e09660300d0bf
      [ref:longbulk]: https://github.com/raspberrypi/linux/commit/37f6f19a83722c9b866cecb5e455b2e16e5bbc6b
    
    Differential Revision:  https://reviews.freebsd.org/D37878
    Submitted by: Marco Devesas Campos <devesas.campos@gmail.com>
---
 sys/arm/broadcom/bcm2835/bcm2835_audio.c           | 152 +++++++++++--
 sys/arm/broadcom/bcm2835/vc_vchi_audioserv_defs.h  |   8 +-
 sys/arm64/conf/std.broadcom                        |   3 +
 sys/contrib/vchiq/interface/compat/vchi_bsd.c      |  12 +-
 .../vchiq/interface/vchiq_arm/vchiq_2835_arm.c     | 145 +++++++++++--
 sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.c  | 235 +++++++++++----------
 sys/contrib/vchiq/interface/vchiq_arm/vchiq_core.c |  91 ++++----
 sys/contrib/vchiq/interface/vchiq_arm/vchiq_core.h |  11 +-
 sys/contrib/vchiq/interface/vchiq_arm/vchiq_kmod.c |  76 ++++++-
 .../vchiq/interface/vchiq_arm/vchiq_pagelist.h     |   8 +-
 sys/contrib/vchiq/interface/vchiq_arm/vchiq_shim.c |   4 +-
 11 files changed, 531 insertions(+), 214 deletions(-)

diff --git a/sys/arm/broadcom/bcm2835/bcm2835_audio.c b/sys/arm/broadcom/bcm2835/bcm2835_audio.c
index 06bbc67bd7bd..1406fcc3d952 100644
--- a/sys/arm/broadcom/bcm2835/bcm2835_audio.c
+++ b/sys/arm/broadcom/bcm2835/bcm2835_audio.c
@@ -113,6 +113,12 @@ struct bcm2835_audio_chinfo {
 	uint64_t retrieved_samples;
 	uint64_t underruns;
 	int starved;
+	struct bcm_log_vars {
+		unsigned int bsize ;
+		int slept_for_lack_of_space ;
+	} log_vars;
+#define DEFAULT_LOG_VALUES \
+	((struct bcm_log_vars) { .bsize = 0 , .slept_for_lack_of_space = 0 })
 };
 
 struct bcm2835_audio_info {
@@ -164,6 +170,10 @@ struct bcm2835_audio_info {
 			device_printf((sc)->dev, __VA_ARGS__);	\
 	} while(0)
 
+/* Useful for circular buffer calcs */
+#define MOD_DIFF(front,rear,mod) (((mod) + (front) - (rear)) % (mod))
+
+
 static const char *
 dest_description(uint32_t dest)
 {
@@ -237,10 +247,21 @@ bcm2835_audio_callback(void *param, const VCHI_CALLBACK_REASON_T reason, void *m
 			    m.type);
 		}
 	} else if (m.type == VC_AUDIO_MSG_TYPE_COMPLETE) {
-		struct bcm2835_audio_chinfo *ch = m.u.complete.cookie;
+		unsigned int signaled = 0;
+		struct bcm2835_audio_chinfo *ch ;
+#if defined(__aarch64__)
+		ch = (void *) ((((size_t)m.u.complete.callback) << 32)
+		    | ((size_t)m.u.complete.cookie));
+#else
+		ch = (void *) (m.u.complete.cookie);
+#endif
 
 		int count = m.u.complete.count & 0xffff;
 		int perr = (m.u.complete.count & (1U << 30)) != 0;
+
+		BCM2835_LOG_TRACE(sc, "in:: count:0x%x perr:%d\n",
+		    m.u.complete.count, perr);
+
 		ch->callbacks++;
 		if (perr)
 			ch->underruns++;
@@ -264,13 +285,31 @@ bcm2835_audio_callback(void *param, const VCHI_CALLBACK_REASON_T reason, void *m
 					    (uintmax_t)ch->retrieved_samples,
 					    (uintmax_t)ch->submitted_samples);
 				}
-				ch->available_space += count;
-				ch->retrieved_samples += count;
 			}
-			if (perr || (ch->available_space >= VCHIQ_AUDIO_PACKET_SIZE))
-				cv_signal(&sc->worker_cv);
+			ch->available_space += count;
+			ch->retrieved_samples += count;
+			/*
+			 *  XXXMDC
+			 *  Experimental: if VC says it's empty, believe it
+			 *  Has to come after the usual adjustments
+			 */
+			if(perr){
+				ch->available_space = VCHIQ_AUDIO_BUFFER_SIZE;
+				perr = ch->retrieved_samples; // shd be != 0
+			}
+
+			if ((ch->available_space >= 1*VCHIQ_AUDIO_PACKET_SIZE)){
+					cv_signal(&sc->worker_cv);
+				signaled = 1;
+			}
 		}
 		BCM2835_AUDIO_UNLOCK(sc);
+		if(perr){
+			BCM2835_LOG_WARN(sc,
+			    "VC starved; reported %u for a total of %u\n"
+			    "worker %s\n", count, perr,
+			    (signaled ? "signaled": "not signaled"));
+		}
 	} else
 		BCM2835_LOG_WARN(sc, "%s: unknown m.type: %d\n", __func__,
 		    m.type);
@@ -371,6 +410,7 @@ bcm2835_audio_stop(struct bcm2835_audio_chinfo *ch)
 		m.type = VC_AUDIO_MSG_TYPE_STOP;
 		m.u.stop.draining = 0;
 
+		BCM2835_LOG_INFO(sc,"sending stop\n");
 		ret = vchi_msg_queue(sc->vchi_handle,
 		    &m, sizeof m, VCHI_FLAGS_BLOCK_UNTIL_QUEUED, NULL);
 
@@ -449,18 +489,25 @@ static bool
 bcm2835_audio_buffer_should_sleep(struct bcm2835_audio_chinfo *ch)
 {
 
+	ch->log_vars.slept_for_lack_of_space = 0;
 	if (ch->playback_state != PLAYBACK_PLAYING)
 		return (true);
 
 	/* Not enough data */
-	if (sndbuf_getready(ch->buffer) < VCHIQ_AUDIO_PACKET_SIZE) {
-		printf("starve\n");
+	/* XXXMDC Take unsubmitted stuff into account */
+	if (sndbuf_getready(ch->buffer)
+			- MOD_DIFF(
+				ch->unsubmittedptr,
+				sndbuf_getreadyptr(ch->buffer),
+				ch->buffer->bufsize
+			) < VCHIQ_AUDIO_PACKET_SIZE) {
 		ch->starved++;
 		return (true);
 	}
 
 	/* Not enough free space */
 	if (ch->available_space < VCHIQ_AUDIO_PACKET_SIZE) {
+		ch->log_vars.slept_for_lack_of_space = 1;
 		return (true);
 	}
 
@@ -481,8 +528,13 @@ bcm2835_audio_write_samples(struct bcm2835_audio_chinfo *ch, void *buf, uint32_t
 	m.type = VC_AUDIO_MSG_TYPE_WRITE;
 	m.u.write.count = count;
 	m.u.write.max_packet = VCHIQ_AUDIO_PACKET_SIZE;
-	m.u.write.callback = NULL;
-	m.u.write.cookie = ch;
+#if defined(__aarch64__)
+	m.u.write.callback = (uint32_t)(((size_t) ch) >> 32) & 0xffffffff;
+	m.u.write.cookie = (uint32_t)(((size_t) ch) & 0xffffffff);
+#else
+	m.u.write.callback = (uint32_t) NULL;
+	m.u.write.cookie = (uint32_t) ch;
+#endif
 	m.u.write.silence = 0;
 
 	ret = vchi_msg_queue(sc->vchi_handle,
@@ -529,6 +581,11 @@ bcm2835_audio_worker(void *data)
 		while ((sc->flags_pending == 0) &&
 		    bcm2835_audio_buffer_should_sleep(ch)) {
 			cv_wait_sig(&sc->worker_cv, &sc->lock);
+			if ((sc->flags_pending == 0) &&
+			    (ch->log_vars.slept_for_lack_of_space)) {
+				BCM2835_LOG_TRACE(sc,
+				    "slept for lack of space\n");
+			}
 		}
 		flags = sc->flags_pending;
 		/* Clear pending flags */
@@ -555,16 +612,25 @@ bcm2835_audio_worker(void *data)
 			BCM2835_AUDIO_LOCK(sc);
 			bcm2835_audio_reset_channel(&sc->pch);
 			ch->playback_state = PLAYBACK_IDLE;
+			long sub_total = ch->submitted_samples;
+			long retd = ch->retrieved_samples;
 			BCM2835_AUDIO_UNLOCK(sc);
+			BCM2835_LOG_INFO(sc,
+			    "stopped audio. submitted a total of %lu "
+			    "having been acked %lu\n", sub_total, retd);
 			continue;
 		}
 
 		/* Requested to start playback */
 		if ((flags & AUDIO_PLAY) &&
 		    (ch->playback_state == PLAYBACK_IDLE)) {
+			BCM2835_LOG_INFO(sc, "starting audio\n");
+			unsigned int bsize = ch->buffer->bufsize;
 			BCM2835_AUDIO_LOCK(sc);
 			ch->playback_state = PLAYBACK_PLAYING;
+			ch->log_vars.bsize = bsize;
 			BCM2835_AUDIO_UNLOCK(sc);
+			BCM2835_LOG_INFO(sc, "buffer size is %u\n", bsize);
 			bcm2835_audio_start(ch);
 		}
 
@@ -574,19 +640,65 @@ bcm2835_audio_worker(void *data)
 		if (sndbuf_getready(ch->buffer) == 0)
 			continue;
 
-		count = sndbuf_getready(ch->buffer);
+		uint32_t i_count;
+
+		/* XXXMDC Take unsubmitted stuff into account */
+		count = i_count = sndbuf_getready(ch->buffer)
+		    - MOD_DIFF(ch->unsubmittedptr,
+		     sndbuf_getreadyptr(ch->buffer),
+		     ch->buffer->bufsize);
 		size = ch->buffer->bufsize;
-		readyptr = sndbuf_getreadyptr(ch->buffer);
+		readyptr = ch->unsubmittedptr;
+
+		int size_changed = 0;
+		unsigned int available;
 
 		BCM2835_AUDIO_LOCK(sc);
-		if (readyptr + count > size)
+		if (size != ch->log_vars.bsize) {
+			ch->log_vars.bsize = size;
+			size_changed = 1;
+		}
+		available = ch->available_space;
+		/*
+		 *  XXXMDC
+		 *
+		 *  On arm64, got into situations where
+		 *  readyptr was less than a packet away
+		 *  from the end of the buffer, which led
+		 *  to count being set to 0 and, inexorably, starvation.
+		 *  Code below tries to take that into account.
+		 *  The problem might have been fixed with some of the
+		 *  other changes that were made in the meantime,
+		 *  but for now this works fine.
+		 */
+		if (readyptr + count > size) {
 			count = size - readyptr;
-		count = min(count, ch->available_space);
-		count -= (count % VCHIQ_AUDIO_PACKET_SIZE);
+		}
+		if(count > ch->available_space){
+			count = ch->available_space;
+			count -= (count % VCHIQ_AUDIO_PACKET_SIZE);
+		}else if (count > VCHIQ_AUDIO_PACKET_SIZE){
+			count -= (count % VCHIQ_AUDIO_PACKET_SIZE);
+		}else if (size > count + readyptr) {
+			count = 0;
+		}
 		BCM2835_AUDIO_UNLOCK(sc);
 
-		if (count < VCHIQ_AUDIO_PACKET_SIZE)
+		if (count % VCHIQ_AUDIO_PACKET_SIZE != 0) {
+			BCM2835_LOG_WARN(sc, "count: %u  initial count: %u  "
+			    "size: %u  readyptr: %u  available: %u\n", count,
+			    i_count,size,readyptr,available);
+		}
+		if (size_changed)
+		    BCM2835_LOG_INFO(sc, "bsize changed to %u\n", size);
+
+		if (count == 0) {
+			BCM2835_LOG_WARN(sc,
+			    "not enough room for a packet: count %d,"
+			    " i_count %d, rptr %d, size %d\n",
+			    count, i_count, readyptr, size);
 			continue;
+		}
 
 		buf = ch->buffer->buf + readyptr;
 
@@ -596,8 +708,16 @@ bcm2835_audio_worker(void *data)
 		    ch->buffer->bufsize;
 		ch->available_space -= count;
 		ch->submitted_samples += count;
+		long sub = count;
+		long sub_total = ch->submitted_samples;
+		long retd = ch->retrieved_samples;
 		KASSERT(ch->available_space >= 0, ("ch->available_space == %d\n", ch->available_space));
 		BCM2835_AUDIO_UNLOCK(sc);
+
+		BCM2835_LOG_TRACE(sc,
+		    "submitted %lu for a total of %lu having been acked %lu; "
+		    "rptr %d, had %u available\n", sub, sub_total, retd,
+		    readyptr, available);
 	}
 
 	BCM2835_AUDIO_LOCK(sc);
@@ -650,6 +770,8 @@ bcmchan_init(kobj_t obj, void *devinfo, struct snd_dbuf *b, struct pcm_channel *
 		return NULL;
 	}
 
+	ch->log_vars = DEFAULT_LOG_VALUES;
+
 	BCM2835_AUDIO_LOCK(sc);
 	bcm2835_worker_update_params(sc);
 	BCM2835_AUDIO_UNLOCK(sc);
diff --git a/sys/arm/broadcom/bcm2835/vc_vchi_audioserv_defs.h b/sys/arm/broadcom/bcm2835/vc_vchi_audioserv_defs.h
index 896e706ff492..ea972ff2d001 100644
--- a/sys/arm/broadcom/bcm2835/vc_vchi_audioserv_defs.h
+++ b/sys/arm/broadcom/bcm2835/vc_vchi_audioserv_defs.h
@@ -112,8 +112,8 @@ typedef struct
 typedef struct
 {
 	uint32_t count; /* in bytes */
-	void *callback;
-	void *cookie;
+	uint32_t callback;
+	uint32_t cookie;
 	uint16_t silence;
 	uint16_t max_packet;
 } VC_AUDIO_WRITE_T;
@@ -129,8 +129,8 @@ typedef struct
 typedef struct
 {
 	int32_t count;  /* Success value */
-	void *callback;
-	void *cookie;
+	uint32_t callback;
+	uint32_t cookie;
 } VC_AUDIO_COMPLETE_T;
 
 /* Message header for all messages in HOST->VC direction */
diff --git a/sys/arm64/conf/std.broadcom b/sys/arm64/conf/std.broadcom
index 3332aaac0826..65bee16e315d 100644
--- a/sys/arm64/conf/std.broadcom
+++ b/sys/arm64/conf/std.broadcom
@@ -33,5 +33,8 @@ device		sdhci
 options 	FDT
 device		acpi
 
+# Sound support
+device		vchiq
+
 # DTBs
 makeoptions	MODULES_EXTRA+="dtb/rpi"
diff --git a/sys/contrib/vchiq/interface/compat/vchi_bsd.c b/sys/contrib/vchiq/interface/compat/vchi_bsd.c
index 8f47b3dc02d6..08f2f66dfc54 100644
--- a/sys/contrib/vchiq/interface/compat/vchi_bsd.c
+++ b/sys/contrib/vchiq/interface/compat/vchi_bsd.c
@@ -340,7 +340,6 @@ down_interruptible(struct semaphore *s)
 	int ret ;
 
 	ret = 0;
-
 	mtx_lock(&s->mtx);
 
 	while (s->value == 0) {
@@ -348,13 +347,11 @@ down_interruptible(struct semaphore *s)
 		ret = cv_wait_sig(&s->cv, &s->mtx);
 		s->waiters--;
 
-		if (ret == EINTR) {
+		/* XXXMDC As per its semaphore.c, linux can only return EINTR */
+		if (ret) {
 			mtx_unlock(&s->mtx);
-			return (-EINTR);
+			return -EINTR;
 		}
-
-		if (ret == ERESTART)
-			continue;
 	}
 
 	s->value--;
@@ -441,8 +438,7 @@ flush_signals(VCHIQ_THREAD_T thr)
 int
 fatal_signal_pending(VCHIQ_THREAD_T thr)
 {
-	printf("Implement ME: %s\n", __func__);
-	return (0);
+	return (curproc_sigkilled());
 }
 
 /*
diff --git a/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c b/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c
index 185e81e71bdc..7e105a6b3b77 100644
--- a/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c
+++ b/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c
@@ -65,9 +65,24 @@ MALLOC_DEFINE(M_VCPAGELIST, "vcpagelist", "VideoCore pagelist memory");
 
 #define MAX_FRAGMENTS (VCHIQ_NUM_CURRENT_BULKS * 2)
 
+/*
+ *  XXXMDC
+ * Do this less ad-hoc-y -- e.g.
+ * https://github.com/raspberrypi/linux/commit/c683db8860a80562a2bb5b451d77b3e471d24f36
+ */
+#if defined(__aarch64__)
+int g_cache_line_size = 64;
+#else
 int g_cache_line_size = 32;
+#endif
 static int g_fragment_size;
 
+unsigned int g_long_bulk_space = 0;
+#define VM_PAGE_TO_VC_BULK_PAGE(x) (\
+	g_long_bulk_space ? VM_PAGE_TO_PHYS(x)\
+		 : PHYS_TO_VCBUS(VM_PAGE_TO_PHYS(x))\
+)
+
 typedef struct vchiq_2835_state_struct {
    int inited;
    VCHIQ_ARM_STATE_T arm_state;
@@ -113,6 +128,54 @@ vchiq_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
 	*addr = PHYS_TO_VCBUS(segs[0].ds_addr);
 }
 
+#if defined(__aarch64__) /* See comment in free_pagelist */
+static int
+invalidate_cachelines_in_range_of_ppage(
+	vm_page_t p,
+	size_t offset,
+	size_t count
+)
+{
+	if(offset + count > PAGE_SIZE){ return EINVAL; }
+        uint8_t *dst = (uint8_t*)pmap_quick_enter_page(p);
+        if (!dst){
+                return ENOMEM;
+	}
+	cpu_dcache_inv_range((void *)((vm_offset_t)dst + offset), count);
+	pmap_quick_remove_page((vm_offset_t)dst);
+	return 0;
+}
+
+/* XXXMDC bulk instead of loading and invalidating single pages? */
+static void
+invalidate_cachelines_in_range_of_ppage_seq(vm_page_t *p, size_t start,
+    size_t count)
+{
+	if (start >= PAGE_SIZE)
+		goto invalid_input;
+
+#define _NEXT_AT(x,_m) (((x)+((_m)-1)) & ~((_m)-1))   /* for power of two m */
+	size_t offset = _NEXT_AT(start,g_cache_line_size);
+#undef _NEXT_AT
+	count = (offset < start + count) ? count - (offset - start) : 0;
+	offset = offset & (PAGE_SIZE - 1);
+	for (size_t done = 0; count > done;
+	    p++, done += PAGE_SIZE - offset, offset = 0) {
+		size_t in_page = PAGE_SIZE - offset;
+		size_t todo = (count-done > in_page) ? in_page : count-done;
+		int e = invalidate_cachelines_in_range_of_ppage(*p, offset, todo);
+		if (e != 0)
+			goto problem_in_loop;
+	}
+	return;
+
+problem_in_loop:
+invalid_input:
+	WARN_ON(1);
+	return;
+}
+#endif
+
 static int
 copyout_page(vm_page_t p, size_t offset, void *kaddr, size_t size)
 {
@@ -171,7 +234,7 @@ vchiq_platform_init(VCHIQ_STATE_T *state)
 		goto failed_load;
 	}
 
-	WARN_ON(((int)g_slot_mem & (PAGE_SIZE - 1)) != 0);
+	WARN_ON(((size_t)g_slot_mem & (PAGE_SIZE - 1)) != 0);
 
 	vchiq_slot_zero = vchiq_init_slots(g_slot_mem, g_slot_mem_size);
 	if (!vchiq_slot_zero) {
@@ -391,13 +454,14 @@ pagelist_page_free(vm_page_t pp)
 ** from increased speed as a result.
 */
 
+
 static int
 create_pagelist(char __user *buf, size_t count, unsigned short type,
 	struct proc *p, BULKINFO_T *bi)
 {
 	PAGELIST_T *pagelist;
 	vm_page_t* pages;
-	unsigned long *addrs;
+	uint32_t *addrs;
 	unsigned int num_pages, i;
 	vm_offset_t offset;
 	int pagelist_size;
@@ -434,7 +498,7 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
 
 	err = bus_dmamem_alloc(bi->pagelist_dma_tag, (void **)&pagelist,
 	    BUS_DMA_COHERENT | BUS_DMA_WAITOK, &bi->pagelist_dma_map);
-	if (err) {
+	if (err || !pagelist) {
 		vchiq_log_error(vchiq_core_log_level, "Unable to allocate pagelist memory");
 		err = -ENOMEM;
 		goto failed_alloc;
@@ -447,6 +511,7 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
 	if (err) {
 		vchiq_log_error(vchiq_core_log_level, "cannot load DMA map for pagelist memory");
 		err = -ENOMEM;
+		bi->pagelist = pagelist;
 		goto failed_load;
 	}
 
@@ -463,8 +528,9 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
 	if (actual_pages != num_pages) {
 		if (actual_pages > 0)
 			vm_page_unhold_pages(pages, actual_pages);
-		free(pagelist, M_VCPAGELIST);
-		return (-ENOMEM);
+		err = -ENOMEM;
+		bi->pagelist = pagelist;
+		goto failed_hold;
 	}
 
 	pagelist->length = count;
@@ -473,27 +539,28 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
 
 	/* Group the pages into runs of contiguous pages */
 
-	base_addr = (void *)PHYS_TO_VCBUS(VM_PAGE_TO_PHYS(pages[0]));
+	size_t run_ceil = g_long_bulk_space ? 0x100 : PAGE_SIZE;
+	unsigned int pg_addr_rshift = g_long_bulk_space ? 4 : 0;
+	base_addr = (void *) VM_PAGE_TO_VC_BULK_PAGE(pages[0]);
 	next_addr = base_addr + PAGE_SIZE;
 	addridx = 0;
 	run = 0;
-
+#define _PG_BLOCK(base,run) \
+		((((size_t) (base)) >> pg_addr_rshift) & ~(run_ceil-1)) + (run)
 	for (i = 1; i < num_pages; i++) {
-		addr = (void *)PHYS_TO_VCBUS(VM_PAGE_TO_PHYS(pages[i]));
-		if ((addr == next_addr) && (run < (PAGE_SIZE - 1))) {
+		addr = (void *)VM_PAGE_TO_VC_BULK_PAGE(pages[i]);
+		if ((addr == next_addr) && (run < run_ceil - 1)) {
 			next_addr += PAGE_SIZE;
 			run++;
 		} else {
-			addrs[addridx] = (unsigned long)base_addr + run;
-			addridx++;
+			addrs[addridx++] = (uint32_t) _PG_BLOCK(base_addr,run);
 			base_addr = addr;
 			next_addr = addr + PAGE_SIZE;
 			run = 0;
 		}
 	}
-
-	addrs[addridx] = (unsigned long)base_addr + run;
-	addridx++;
+	addrs[addridx++] = _PG_BLOCK(base_addr, run);
+#undef _PG_BLOCK
 
 	/* Partial cache lines (fragments) require special measures */
 	if ((type == PAGELIST_READ) &&
@@ -514,20 +581,35 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
 		WARN_ON(fragments == NULL);
 		g_free_fragments = *(char **) g_free_fragments;
 		up(&g_free_fragments_mutex);
-		pagelist->type =
-			 PAGELIST_READ_WITH_FRAGMENTS + 
-			 (fragments - g_fragments_base)/g_fragment_size;
+		pagelist->type = PAGELIST_READ_WITH_FRAGMENTS
+		     + (fragments - g_fragments_base)/g_fragment_size;
+#if defined(__aarch64__)
+		 bus_dmamap_sync(bcm_slots_dma_tag, bcm_slots_dma_map,
+		     BUS_DMASYNC_PREREAD);
+#endif
 	}
 
+#if defined(__aarch64__)
+	if(type == PAGELIST_READ) {
+		cpu_dcache_wbinv_range(buf, count);
+	} else {
+		cpu_dcache_wb_range(buf, count);
+	}
+	dsb(sy);
+#else
 	pa = pmap_extract(PCPU_GET(curpmap), (vm_offset_t)buf);
 	dcache_wbinv_poc((vm_offset_t)buf, pa, count);
+#endif
 
-	bus_dmamap_sync(bi->pagelist_dma_tag, bi->pagelist_dma_map, BUS_DMASYNC_PREWRITE);
+	bus_dmamap_sync(bi->pagelist_dma_tag, bi->pagelist_dma_map,
+	    BUS_DMASYNC_PREWRITE);
 
 	bi->pagelist = pagelist;
 
 	return 0;
 
+failed_hold:
+	bus_dmamap_unload(bi->pagelist_dma_tag,bi->pagelist_dma_map);
 failed_load:
 	bus_dmamem_free(bi->pagelist_dma_tag, bi->pagelist, bi->pagelist_dma_map);
 failed_alloc:
@@ -556,6 +638,24 @@ free_pagelist(BULKINFO_T *bi, int actual)
 
 	pages = (vm_page_t*)(pagelist->addrs + num_pages);
 
+#if defined(__aarch64__)
+	/*
+         * On arm64, even if the user keeps their end of the bargain
+	 * -- do NOT touch the buffers sent to VC -- but reads around the
+	 * pagelist after the invalidation above, the arm might preemptively
+	 * load (and validate) cache lines for areas inside the page list,
+	 * so we must invalidate them again.
+	 *
+	 * The functional test does it and without this it doesn't pass.
+	 *
+	 * XXXMDC might it be enough to invalidate a couple of pages at
+	 * the ends of the page list?
+	 */
+	if(pagelist->type >= PAGELIST_READ && actual > 0)
+		invalidate_cachelines_in_range_of_ppage_seq(pages,
+		    pagelist->offset, actual);
+#endif
+
 	/* Deal with any partial cache lines (fragments) */
 	if (pagelist->type >= PAGELIST_READ_WITH_FRAGMENTS) {
 		char *fragments = g_fragments_base +
@@ -592,13 +692,18 @@ free_pagelist(BULKINFO_T *bi, int actual)
 		up(&g_free_fragments_sema);
 	}
 
-	for (i = 0; i < num_pages; i++) {
-		if (pagelist->type != PAGELIST_WRITE) {
+	if (pagelist->type != PAGELIST_WRITE) {
+		for (i = 0; i < num_pages; i++) {
 			vm_page_dirty(pages[i]);
 			pagelist_page_free(pages[i]);
 		}
 	}
 
+#if defined(__aarch64__)
+	/* XXXMDC necessary? */
+	dsb(sy);
+#endif
+
 	bus_dmamap_unload(bi->pagelist_dma_tag, bi->pagelist_dma_map);
 	bus_dmamem_free(bi->pagelist_dma_tag, bi->pagelist, bi->pagelist_dma_map);
 	bus_dma_tag_destroy(bi->pagelist_dma_tag);
diff --git a/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.c b/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.c
index e25c4d738922..36f9d0e3410d 100644
--- a/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.c
+++ b/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.c
@@ -386,7 +386,7 @@ static void
 user_service_free(void *userdata)
 {
 	USER_SERVICE_T *user_service = userdata;
-	
+
 	_sema_destroy(&user_service->insert_event);
 	_sema_destroy(&user_service->remove_event);
 
@@ -410,7 +410,7 @@ static void close_delivered(USER_SERVICE_T *user_service)
 
 		/* Wake the user-thread blocked in close_ or remove_service */
 		up(&user_service->close_event);
- 
+
 		user_service->close_pending = 0;
 	}
 }
@@ -749,6 +749,7 @@ vchiq_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int fflag,
 				(size_t)waiter, current->p_pid);
 			args.userdata = &waiter->bulk_waiter;
 		}
+
 		status = vchiq_bulk_transfer
 			(args.handle,
 			 VCHI_MEM_HANDLE_INVALID,
@@ -1093,7 +1094,7 @@ vchiq_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int fflag,
 	} break;
 
 	case VCHIQ_IOC_LIB_VERSION: {
-		unsigned int lib_version = (unsigned int)arg;
+		size_t lib_version = (size_t)arg;
 
 		if (lib_version < VCHIQ_VERSION_MIN)
 			ret = -EINVAL;
@@ -1155,18 +1156,14 @@ vchiq_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int fflag,
 	return ret;
 }
 
-static void
-instance_dtr(void *data)
-{
 
-	kfree(data);
-}
 
 /****************************************************************************
 *
 *   vchiq_open
 *
 ***************************************************************************/
+static void instance_dtr(void *data);
 
 static int
 vchiq_open(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
@@ -1206,7 +1203,7 @@ vchiq_open(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
 		INIT_LIST_HEAD(&instance->bulk_waiter_list);
 
 		devfs_set_cdevpriv(instance, instance_dtr);
-	} 
+	}
 	else {
 		vchiq_log_error(vchiq_arm_log_level,
 			"Unknown minor device");
@@ -1222,143 +1219,151 @@ vchiq_open(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
 *
 ***************************************************************************/
 
+
 static int
-vchiq_close(struct cdev *dev, int flags __unused, int fmt __unused,
-                struct thread *td)
+_vchiq_close_instance(VCHIQ_INSTANCE_T instance)
 {
 	int ret = 0;
-	if (1) {
-		VCHIQ_INSTANCE_T instance;
-		VCHIQ_STATE_T *state = vchiq_get_state();
-		VCHIQ_SERVICE_T *service;
-		int i;
-
-		if ((ret = devfs_get_cdevpriv((void**)&instance))) {
-			printf("devfs_get_cdevpriv failed: error %d\n", ret);
-			return (ret);
-		}
-
-		vchiq_log_info(vchiq_arm_log_level,
-			"vchiq_release: instance=%lx",
-			(unsigned long)instance);
-
-		if (!state) {
-			ret = -EPERM;
-			goto out;
-		}
+	VCHIQ_STATE_T *state = vchiq_get_state();
+	VCHIQ_SERVICE_T *service;
+	int i;
 
-		/* Ensure videocore is awake to allow termination. */
-		vchiq_use_internal(instance->state, NULL,
-				USE_TYPE_VCHIQ);
+	vchiq_log_info(vchiq_arm_log_level,
+		"vchiq_release: instance=%lx",
+		(unsigned long)instance);
 
-		lmutex_lock(&instance->completion_mutex);
+	if (!state) {
+		ret = -EPERM;
+		goto out;
+	}
 
-		/* Wake the completion thread and ask it to exit */
-		instance->closing = 1;
-		up(&instance->insert_event);
+	/* Ensure videocore is awake to allow termination. */
+	vchiq_use_internal(instance->state, NULL,
+			USE_TYPE_VCHIQ);
 
-		lmutex_unlock(&instance->completion_mutex);
+	lmutex_lock(&instance->completion_mutex);
 
-		/* Wake the slot handler if the completion queue is full. */
-		up(&instance->remove_event);
+	/* Wake the completion thread and ask it to exit */
+	instance->closing = 1;
+	up(&instance->insert_event);
 
-		/* Mark all services for termination... */
-		i = 0;
-		while ((service = next_service_by_instance(state, instance,
-			&i)) !=	NULL) {
-			USER_SERVICE_T *user_service = service->base.userdata;
+	lmutex_unlock(&instance->completion_mutex);
 
-			/* Wake the slot handler if the msg queue is full. */
-			up(&user_service->remove_event);
+	/* Wake the slot handler if the completion queue is full. */
+	up(&instance->remove_event);
 
-			vchiq_terminate_service_internal(service);
-			unlock_service(service);
-		}
+	/* Mark all services for termination... */
+	i = 0;
+	while ((service = next_service_by_instance(state, instance,
+		&i)) !=	NULL) {
+		USER_SERVICE_T *user_service = service->base.userdata;
 
-		/* ...and wait for them to die */
-		i = 0;
-		while ((service = next_service_by_instance(state, instance, &i))
-			!= NULL) {
-			USER_SERVICE_T *user_service = service->base.userdata;
+		/* Wake the slot handler if the msg queue is full. */
+		up(&user_service->remove_event);
 
-			down(&service->remove_event);
+		vchiq_terminate_service_internal(service);
+		unlock_service(service);
+	}
 
-			BUG_ON(service->srvstate != VCHIQ_SRVSTATE_FREE);
+	/* ...and wait for them to die */
+	i = 0;
+	while ((service = next_service_by_instance(state, instance, &i))
+		!= NULL) {
+		USER_SERVICE_T *user_service = service->base.userdata;
 
-			spin_lock(&msg_queue_spinlock);
+		down(&service->remove_event);
 
-			while (user_service->msg_remove !=
-				user_service->msg_insert) {
-				VCHIQ_HEADER_T *header = user_service->
-					msg_queue[user_service->msg_remove &
-						(MSG_QUEUE_SIZE - 1)];
-				user_service->msg_remove++;
-				spin_unlock(&msg_queue_spinlock);
+		BUG_ON(service->srvstate != VCHIQ_SRVSTATE_FREE);
 
-				if (header)
-					vchiq_release_message(
-						service->handle,
-						header);
-				spin_lock(&msg_queue_spinlock);
-			}
+		spin_lock(&msg_queue_spinlock);
 
+		while (user_service->msg_remove !=
+			user_service->msg_insert) {
+			VCHIQ_HEADER_T *header = user_service->
+				msg_queue[user_service->msg_remove &
+					(MSG_QUEUE_SIZE - 1)];
+			user_service->msg_remove++;
 			spin_unlock(&msg_queue_spinlock);
 
-			unlock_service(service);
+			if (header)
+				vchiq_release_message(
+					service->handle,
+					header);
+			spin_lock(&msg_queue_spinlock);
 		}
 
-		/* Release any closed services */
-		while (instance->completion_remove !=
-			instance->completion_insert) {
-			VCHIQ_COMPLETION_DATA_T *completion;
-			VCHIQ_SERVICE_T *service1;
-			completion = &instance->completions[
-				instance->completion_remove &
-				(MAX_COMPLETIONS - 1)];
-			service1 = completion->service_userdata;
-			if (completion->reason == VCHIQ_SERVICE_CLOSED)
-			{
-				USER_SERVICE_T *user_service =
-					service->base.userdata;
-
-				/* Wake any blocked user-thread */
-				if (instance->use_close_delivered)
-					up(&user_service->close_event);
-				unlock_service(service1);
-			}
-			instance->completion_remove++;
-		}
+		spin_unlock(&msg_queue_spinlock);
 
-		/* Release the PEER service count. */
-		vchiq_release_internal(instance->state, NULL);
+		unlock_service(service);
+	}
 
+	/* Release any closed services */
+	while (instance->completion_remove !=
+		instance->completion_insert) {
+		VCHIQ_COMPLETION_DATA_T *completion;
+		VCHIQ_SERVICE_T *service;
+		completion = &instance->completions[
+			instance->completion_remove &
+			(MAX_COMPLETIONS - 1)];
+		service = completion->service_userdata;
+		if (completion->reason == VCHIQ_SERVICE_CLOSED)
 		{
-			struct list_head *pos, *next;
-			list_for_each_safe(pos, next,
-				&instance->bulk_waiter_list) {
-				struct bulk_waiter_node *waiter;
-				waiter = list_entry(pos,
-					struct bulk_waiter_node,
-					list);
-				list_del(pos);
-				vchiq_log_info(vchiq_arm_log_level,
-					"bulk_waiter - cleaned up %x "
-					"for pid %d",
-					(unsigned int)waiter, waiter->pid);
-		                _sema_destroy(&waiter->bulk_waiter.event);
-				kfree(waiter);
-			}
-		}
+			USER_SERVICE_T *user_service =
+				service->base.userdata;
 
+			/* Wake any blocked user-thread */
+			if (instance->use_close_delivered)
+				up(&user_service->close_event);
+
+			unlock_service(service);
+		}
+		instance->completion_remove++;
 	}
-	else {
-		vchiq_log_error(vchiq_arm_log_level,
-			"Unknown minor device");
-		ret = -ENXIO;
+
+	/* Release the PEER service count. */
+	vchiq_release_internal(instance->state, NULL);
+
+	{
+		struct list_head *pos, *next;
+		list_for_each_safe(pos, next,
+			&instance->bulk_waiter_list) {
+			struct bulk_waiter_node *waiter;
+			waiter = list_entry(pos,
+				struct bulk_waiter_node,
+				list);
+			list_del(pos);
+			vchiq_log_info(vchiq_arm_log_level,
+				"bulk_waiter - cleaned up %zx "
+				"for pid %d",
+				(size_t)waiter, waiter->pid);
*** 476 LINES SKIPPED ***