git: a8ebe6902a15 - main - security/nettle: Fix build of assembly code on aarch64

From: Po-Chuan Hsieh <sunpoet@FreeBSD.org>
Date: Mon, 11 Jul 2022 13:54:38 UTC
The branch main has been updated by sunpoet:

URL: https://cgit.FreeBSD.org/ports/commit/?id=a8ebe6902a15a11102372d0575c18cc9a01f19b1

commit a8ebe6902a15a11102372d0575c18cc9a01f19b1
Author:     Po-Chuan Hsieh <sunpoet@FreeBSD.org>
AuthorDate: 2022-07-11 13:51:34 +0000
Commit:     Po-Chuan Hsieh <sunpoet@FreeBSD.org>
CommitDate: 2022-07-11 13:51:34 +0000

    security/nettle: Fix build of assembly code on aarch64
    
    PR:             264946
    Reported by:    diizzy
    Tested by:      diizzy (RockPro64 with GnuTLS on 13.1-RELEASE)
    Obtained from:  https://git.lysator.liu.se/nettle/nettle/-/commit/d4c7597e4236f746434c9a1a24f6191f7ff870cd
---
 security/nettle/Makefile                           |   4 -
 security/nettle/files/patch-arm64-chacha-4core.asm | 146 +++++++++++++++++++++
 security/nettle/files/patch-fat-arm64.c            |  27 ++++
 .../files/patch-powerpc64-p7-chacha-4core.asm      | 130 ++++++++++++++++++
 .../nettle/files/patch-s390x-vf-chacha-4core.asm   | 130 ++++++++++++++++++
 5 files changed, 433 insertions(+), 4 deletions(-)

diff --git a/security/nettle/Makefile b/security/nettle/Makefile
index c3196b222ad4..2242322e6c36 100644
--- a/security/nettle/Makefile
+++ b/security/nettle/Makefile
@@ -36,10 +36,6 @@ EXAMPLES_USES=		ssl
 
 .include <bsd.port.options.mk>
 
-.if ${ARCH} == "aarch64"
-CONFIGURE_ARGS+=--disable-assembler
-.endif
-
 .if ${ARCH} == "sparc64"
 CONFIGURE_ENV+=	CCPIC=-fPIC
 .endif
diff --git a/security/nettle/files/patch-arm64-chacha-4core.asm b/security/nettle/files/patch-arm64-chacha-4core.asm
new file mode 100644
index 000000000000..2375fa618f1e
--- /dev/null
+++ b/security/nettle/files/patch-arm64-chacha-4core.asm
@@ -0,0 +1,146 @@
+Obtained from:	https://git.lysator.liu.se/nettle/nettle/-/commit/d4c7597e4236f746434c9a1a24f6191f7ff870cd
+
+--- arm64/chacha-4core.asm.orig	2022-06-02 17:57:16 UTC
++++ arm64/chacha-4core.asm
+@@ -53,67 +53,74 @@ define(`TMP3', `v7')
+ 
+ define(`ROT24', `v8')
+ 
++C A workaround for expanding multiple digits of argument references to QR macro which is incompatible with POSIX
++C See https://www.gnu.org/software/m4/manual/html_node/Arguments.html
++define(`P1',
++`ifelse($1, 0, v16, $1, 1, v17, $1, 2, v18, $1, 3, v19, $1, 4, v20, $1, 5, v21, $1, 6, v22, $1, 7, v23, $1, 8, v24, $1, 9, v25, $1, 10, v26, $1, 11, v27, $1, 12, v28, $1, 13, v29, $1, 14, v30, $1, 15, v31)')
++define(`P2',
++`ifelse($1, 0, v16, $1, 1, v21, $1, 2, v26, $1, 3, v31, $1, 4, v20, $1, 5, v25, $1, 6, v30, $1, 7, v19, $1, 8, v24, $1, 9, v29, $1, 10, v18, $1, 11, v23, $1, 12, v28, $1, 13, v17, $1, 14, v22, $1, 15, v27)')
++
+ C Main loop for round
+ define(`QR',`
+-	add		$1.4s, $1.4s, $2.4s
+-	add		$5.4s, $5.4s, $6.4s
+-	add		$9.4s, $9.4s, $10.4s
+-	add		$13.4s, $13.4s, $14.4s
+-	eor		$4.16b, $4.16b, $1.16b
+-	eor		$8.16b, $8.16b, $5.16b
+-	eor		$12.16b, $12.16b, $9.16b
+-	eor		$16.16b, $16.16b, $13.16b
+-	rev32	$4.8h, $4.8h
+-	rev32	$8.8h, $8.8h
+-	rev32	$12.8h, $12.8h
+-	rev32	$16.8h, $16.8h
++	add		$1(0).4s, $1(0).4s, $1(1).4s
++	add		$1(4).4s, $1(4).4s, $1(5).4s
++	add		$1(8).4s, $1(8).4s, $1(9).4s
++	add		$1(12).4s, $1(12).4s, $1(13).4s
++	eor		$1(3).16b, $1(3).16b, $1(0).16b
++	eor		$1(7).16b, $1(7).16b, $1(4).16b
++	eor		$1(11).16b, $1(11).16b, $1(8).16b
++	eor		$1(15).16b, $1(15).16b, $1(12).16b
++	rev32	$1(3).8h, $1(3).8h
++	rev32	$1(7).8h, $1(7).8h
++	rev32	$1(11).8h, $1(11).8h
++	rev32	$1(15).8h, $1(15).8h
+ 
+-	add		$3.4s, $3.4s, $4.4s
+-	add		$7.4s, $7.4s, $8.4s
+-	add		$11.4s, $11.4s, $12.4s
+-	add		$15.4s, $15.4s, $16.4s
+-	eor		TMP0.16b, $2.16b, $3.16b
+-	eor		TMP1.16b, $6.16b, $7.16b
+-	eor		TMP2.16b, $10.16b, $11.16b
+-	eor		TMP3.16b, $14.16b, $15.16b
+-	ushr	$2.4s, TMP0.4s, #20
+-	ushr	$6.4s, TMP1.4s, #20
+-	ushr	$10.4s, TMP2.4s, #20
+-	ushr	$14.4s, TMP3.4s, #20
+-	sli		$2.4s, TMP0.4s, #12
+-	sli		$6.4s, TMP1.4s, #12
+-	sli		$10.4s, TMP2.4s, #12
+-	sli		$14.4s, TMP3.4s, #12
++	add		$1(2).4s, $1(2).4s, $1(3).4s
++	add		$1(6).4s, $1(6).4s, $1(7).4s
++	add		$1(10).4s, $1(10).4s, $1(11).4s
++	add		$1(14).4s, $1(14).4s, $1(15).4s
++	eor		TMP0.16b, $1(1).16b, $1(2).16b
++	eor		TMP1.16b, $1(5).16b, $1(6).16b
++	eor		TMP2.16b, $1(9).16b, $1(10).16b
++	eor		TMP3.16b, $1(13).16b, $1(14).16b
++	ushr	$1(1).4s, TMP0.4s, #20
++	ushr	$1(5).4s, TMP1.4s, #20
++	ushr	$1(9).4s, TMP2.4s, #20
++	ushr	$1(13).4s, TMP3.4s, #20
++	sli		$1(1).4s, TMP0.4s, #12
++	sli		$1(5).4s, TMP1.4s, #12
++	sli		$1(9).4s, TMP2.4s, #12
++	sli		$1(13).4s, TMP3.4s, #12
+ 
+-	add		$1.4s, $1.4s, $2.4s
+-	add		$5.4s, $5.4s, $6.4s
+-	add		$9.4s, $9.4s, $10.4s
+-	add		$13.4s, $13.4s, $14.4s
+-	eor		$4.16b, $4.16b, $1.16b
+-	eor		$8.16b, $8.16b, $5.16b
+-	eor		$12.16b, $12.16b, $9.16b
+-	eor		$16.16b, $16.16b, $13.16b
+-	tbl		$4.16b, {$4.16b}, ROT24.16b
+-	tbl		$8.16b, {$8.16b}, ROT24.16b
+-	tbl		$12.16b, {$12.16b}, ROT24.16b
+-	tbl		$16.16b, {$16.16b}, ROT24.16b
++	add		$1(0).4s, $1(0).4s, $1(1).4s
++	add		$1(4).4s, $1(4).4s, $1(5).4s
++	add		$1(8).4s, $1(8).4s, $1(9).4s
++	add		$1(12).4s, $1(12).4s, $1(13).4s
++	eor		$1(3).16b, $1(3).16b, $1(0).16b
++	eor		$1(7).16b, $1(7).16b, $1(4).16b
++	eor		$1(11).16b, $1(11).16b, $1(8).16b
++	eor		$1(15).16b, $1(15).16b, $1(12).16b
++	tbl		$1(3).16b, {$1(3).16b}, ROT24.16b
++	tbl		$1(7).16b, {$1(7).16b}, ROT24.16b
++	tbl		$1(11).16b, {$1(11).16b}, ROT24.16b
++	tbl		$1(15).16b, {$1(15).16b}, ROT24.16b
+ 
+-	add		$3.4s, $3.4s, $4.4s
+-	add		$7.4s, $7.4s, $8.4s
+-	add		$11.4s, $11.4s, $12.4s
+-	add		$15.4s, $15.4s, $16.4s
+-	eor		TMP0.16b, $2.16b, $3.16b
+-	eor		TMP1.16b, $6.16b, $7.16b
+-	eor		TMP2.16b, $10.16b, $11.16b
+-	eor		TMP3.16b, $14.16b, $15.16b
+-	ushr	$2.4s, TMP0.4s, #25
+-	ushr	$6.4s, TMP1.4s, #25
+-	ushr	$10.4s, TMP2.4s, #25
+-	ushr	$14.4s, TMP3.4s, #25
+-	sli		$2.4s, TMP0.4s, #7
+-	sli		$6.4s, TMP1.4s, #7
+-	sli		$10.4s, TMP2.4s, #7
+-	sli		$14.4s, TMP3.4s, #7
++	add		$1(2).4s, $1(2).4s, $1(3).4s
++	add		$1(6).4s, $1(6).4s, $1(7).4s
++	add		$1(10).4s, $1(10).4s, $1(11).4s
++	add		$1(14).4s, $1(14).4s, $1(15).4s
++	eor		TMP0.16b, $1(1).16b, $1(2).16b
++	eor		TMP1.16b, $1(5).16b, $1(6).16b
++	eor		TMP2.16b, $1(9).16b, $1(10).16b
++	eor		TMP3.16b, $1(13).16b, $1(14).16b
++	ushr	$1(1).4s, TMP0.4s, #25
++	ushr	$1(5).4s, TMP1.4s, #25
++	ushr	$1(9).4s, TMP2.4s, #25
++	ushr	$1(13).4s, TMP3.4s, #25
++	sli		$1(1).4s, TMP0.4s, #7
++	sli		$1(5).4s, TMP1.4s, #7
++	sli		$1(9).4s, TMP2.4s, #7
++	sli		$1(13).4s, TMP3.4s, #7
+ ')
+ 
+ define(`TRANSPOSE',`
+@@ -174,8 +181,8 @@ C Load state and splat
+ 	mov		T3.16b, v31.16b
+ 
+ .Loop:
+-	QR(v16, v17,  v18, v19, v20, v21,  v22, v23, v24, v25, v26, v27, v28, v29, v30, v31)
+-	QR(v16, v21, v26, v31, v20, v25, v30, v19, v24, v29,  v18, v23, v28, v17,  v22, v27)
++	QR(`P1')
++	QR(`P2')
+ 	subs	ROUNDS, ROUNDS, #2
+ 	b.ne	.Loop
+ 
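
Why the QR macro was rewritten: POSIX m4 only defines single-digit argument
references inside a macro body, so the base system's BSD m4 parses `$10' as
`$1' followed by a literal `0', while GNU m4 treats it as the tenth argument.
With sixteen arguments to QR, every reference past `$9' was silently mangled
into a bogus register name, which is why the assembly only failed to build
with a POSIX m4. The upstream fix passes a single mapping macro (P1 or P2)
as `$1' and selects registers by index with `$1(n)', leaving only
single-digit references in the body. A minimal sketch of the failure mode
and the workaround; the macro names `pair', `pair2' and `pick' are
illustrative, not from the patch:

    dnl Multi-digit argument references diverge between m4 implementations.
    define(`pair', `first=$1 tenth=$10')dnl
    pair(a, b, c, d, e, f, g, h, i, j)
    dnl GNU m4:       first=a tenth=j
    dnl BSD/POSIX m4: first=a tenth=a0   ($10 parsed as $1, then literal 0)

    dnl Portable form, as in the patch: pass one mapping macro and index
    dnl into it, so the body only ever uses $1.
    define(`pick', `ifelse($1, 0, `a', $1, 1, `b', $1, 9, `j')')dnl
    define(`pair2', `first=$1(0) tenth=$1(9)')dnl
    pair2(`pick')
    dnl Both implementations print: first=a tenth=j
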
diff --git a/security/nettle/files/patch-fat-arm64.c b/security/nettle/files/patch-fat-arm64.c
new file mode 100644
index 000000000000..b1e9cefc943f
--- /dev/null
+++ b/security/nettle/files/patch-fat-arm64.c
@@ -0,0 +1,27 @@
+--- fat-arm64.c.orig	2022-06-02 17:57:15 UTC
++++ fat-arm64.c
+@@ -46,6 +46,9 @@
+ #  include <asm/hwcap.h>
+ #  include <sys/auxv.h>
+ # endif
++#elif defined(__FreeBSD__)
++#  define USE_GETAUXVAL 1
++#  include <sys/auxv.h>
+ #endif
+ 
+ #include "nettle-types.h"
+@@ -113,7 +116,14 @@ get_arm64_features (struct arm64_features *features)
+   else
+     {
+ #if USE_GETAUXVAL
++#if defined (__FreeBSD__)
++    unsigned long hwcap;
++    if(elf_aux_info(AT_HWCAP, &hwcap, sizeof(unsigned long)) != 0) {
++        hwcap = 0;
++    }
++#else
+       unsigned long hwcap = getauxval(AT_HWCAP);
++#endif
+       features->have_aes
+ 	= ((hwcap & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES));
+       features->have_pmull
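
The fat-arm64.c hunk above exists because FreeBSD has no getauxval(3); the
equivalent interface is elf_aux_info(3) (available since FreeBSD 12.0),
which returns 0 on success or an errno value on failure and writes the
result through an out-parameter instead of returning it, hence the extra
branch zeroing hwcap. A minimal standalone sketch of the same AT_HWCAP
probe; the program and its printf are illustrative only:

    #include <sys/auxv.h>   /* FreeBSD: elf_aux_info(), AT_HWCAP */
    #include <stdio.h>

    int
    main(void)
    {
            unsigned long hwcap = 0;

            /*
             * Unlike Linux getauxval(), elf_aux_info() cannot return
             * the value directly: it returns 0 on success or an errno
             * value on failure.  Fall back to 0 ("no optional CPU
             * features") on failure, exactly as the patch does.
             */
            if (elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)) != 0)
                    hwcap = 0;

            printf("AT_HWCAP = %#lx\n", hwcap);
            return (0);
    }
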
diff --git a/security/nettle/files/patch-powerpc64-p7-chacha-4core.asm b/security/nettle/files/patch-powerpc64-p7-chacha-4core.asm
new file mode 100644
index 000000000000..bb1032117cdb
--- /dev/null
+++ b/security/nettle/files/patch-powerpc64-p7-chacha-4core.asm
@@ -0,0 +1,130 @@
+Obtained from:	https://git.lysator.liu.se/nettle/nettle/-/commit/d4c7597e4236f746434c9a1a24f6191f7ff870cd
+
+--- powerpc64/p7/chacha-4core.asm.orig	2022-06-02 17:57:16 UTC
++++ powerpc64/p7/chacha-4core.asm
+@@ -53,59 +53,66 @@ define(`T1', `v21')
+ define(`T2', `v22')
+ define(`T3', `v23')
+ 
++C A workaround for expanding multiple digits of argument references to QR macro which is incompatible with POSIX
++C See https://www.gnu.org/software/m4/manual/html_node/Arguments.html
++define(`P1',
++`ifelse($1, 0, v0, $1, 1, v4, $1, 2, v8, $1, 3, v12, $1, 4, v1, $1, 5, v5, $1, 6, v9, $1, 7, v13, $1, 8, v2, $1, 9, v6, $1, 10, v10, $1, 11, v14, $1, 12, v3, $1, 13, v7, $1, 14, v11, $1, 15, v15)')
++define(`P2',
++`ifelse($1, 0, v0, $1, 1, v5, $1, 2, v10, $1, 3, v15, $1, 4, v1, $1, 5, v6, $1, 6, v11, $1, 7, v12, $1, 8, v2, $1, 9, v7, $1, 10, v8, $1, 11, v13, $1, 12, v3, $1, 13, v4, $1, 14, v9, $1, 15, v14)')
++
+ C Main loop for round
+ define(`QR',`
+-	vadduwm $1, $1, $2
+-	vadduwm $5, $5, $6
+-	vadduwm $9, $9, $10
+-	vadduwm $13, $13, $14
+-	vxor	$4, $4, $1
+-	vxor	$8, $8, $5
+-	vxor	$12, $12, $9
+-	vxor	$16, $16, $13
+-	vrlw	$4, $4, ROT16
+-	vrlw	$8, $8, ROT16
+-	vrlw	$12, $12, ROT16
+-	vrlw	$16, $16, ROT16
++	vadduwm $1(0), $1(0), $1(1)
++	vadduwm $1(4), $1(4), $1(5)
++	vadduwm $1(8), $1(8), $1(9)
++	vadduwm $1(12), $1(12), $1(13)
++	vxor	$1(3), $1(3), $1(0)
++	vxor	$1(7), $1(7), $1(4)
++	vxor	$1(11), $1(11), $1(8)
++	vxor	$1(15), $1(15), $1(12)
++	vrlw	$1(3), $1(3), ROT16
++	vrlw	$1(7), $1(7), ROT16
++	vrlw	$1(11), $1(11), ROT16
++	vrlw	$1(15), $1(15), ROT16
+ 
+-	vadduwm $3, $3, $4
+-	vadduwm $7, $7, $8
+-	vadduwm $11, $11, $12
+-	vadduwm $15, $15, $16
+-	vxor	$2, $2, $3
+-	vxor	$6, $6, $7
+-	vxor	$10, $10, $11
+-	vxor	$14, $14, $15
+-	vrlw	$2, $2, ROT12
+-	vrlw	$6, $6, ROT12
+-	vrlw	$10, $10, ROT12
+-	vrlw	$14, $14, ROT12
++	vadduwm $1(2), $1(2), $1(3)
++	vadduwm $1(6), $1(6), $1(7)
++	vadduwm $1(10), $1(10), $1(11)
++	vadduwm $1(14), $1(14), $1(15)
++	vxor	$1(1), $1(1), $1(2)
++	vxor	$1(5), $1(5), $1(6)
++	vxor	$1(9), $1(9), $1(10)
++	vxor	$1(13), $1(13), $1(14)
++	vrlw	$1(1), $1(1), ROT12
++	vrlw	$1(5), $1(5), ROT12
++	vrlw	$1(9), $1(9), ROT12
++	vrlw	$1(13), $1(13), ROT12
+ 
+-	vadduwm $1, $1, $2
+-	vadduwm $5, $5, $6
+-	vadduwm $9, $9, $10
+-	vadduwm $13, $13, $14
+-	vxor	$4, $4, $1
+-	vxor	$8, $8, $5
+-	vxor	$12, $12, $9
+-	vxor	$16, $16, $13
+-	vrlw	$4, $4, ROT8
+-	vrlw	$8, $8, ROT8
+-	vrlw	$12, $12, ROT8
+-	vrlw	$16, $16, ROT8
++	vadduwm $1(0), $1(0), $1(1)
++	vadduwm $1(4), $1(4), $1(5)
++	vadduwm $1(8), $1(8), $1(9)
++	vadduwm $1(12), $1(12), $1(13)
++	vxor	$1(3), $1(3), $1(0)
++	vxor	$1(7), $1(7), $1(4)
++	vxor	$1(11), $1(11), $1(8)
++	vxor	$1(15), $1(15), $1(12)
++	vrlw	$1(3), $1(3), ROT8
++	vrlw	$1(7), $1(7), ROT8
++	vrlw	$1(11), $1(11), ROT8
++	vrlw	$1(15), $1(15), ROT8
+ 
+-	vadduwm $3, $3, $4
+-	vadduwm $7, $7, $8
+-	vadduwm $11, $11, $12
+-	vadduwm $15, $15, $16
+-	vxor	$2, $2, $3
+-	vxor	$6, $6, $7
+-	vxor	$10, $10, $11
+-	vxor	$14, $14, $15
+-	vrlw	$2, $2, ROT7
+-	vrlw	$6, $6, ROT7
+-	vrlw	$10, $10, ROT7
+-	vrlw	$14, $14, ROT7
++	vadduwm $1(2), $1(2), $1(3)
++	vadduwm $1(6), $1(6), $1(7)
++	vadduwm $1(10), $1(10), $1(11)
++	vadduwm $1(14), $1(14), $1(15)
++	vxor	$1(1), $1(1), $1(2)
++	vxor	$1(5), $1(5), $1(6)
++	vxor	$1(9), $1(9), $1(10)
++	vxor	$1(13), $1(13), $1(14)
++	vrlw	$1(1), $1(1), ROT7
++	vrlw	$1(5), $1(5), ROT7
++	vrlw	$1(9), $1(9), ROT7
++	vrlw	$1(13), $1(13), ROT7
+ ')
+ 
+ define(`TRANSPOSE',`
+@@ -185,8 +192,8 @@ C Load state and splat
+ 	srdi	ROUNDS, ROUNDS, 1
+ 	mtctr	ROUNDS
+ .Loop:
+-	QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+-	QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
++	QR(`P1')
++	QR(`P2')
+ 	bdnz	.Loop
+ 
+ 	C Add in saved original words, including counters, before
diff --git a/security/nettle/files/patch-s390x-vf-chacha-4core.asm b/security/nettle/files/patch-s390x-vf-chacha-4core.asm
new file mode 100644
index 000000000000..23cb5766a37b
--- /dev/null
+++ b/security/nettle/files/patch-s390x-vf-chacha-4core.asm
@@ -0,0 +1,130 @@
+Obtained from:	https://git.lysator.liu.se/nettle/nettle/-/commit/d4c7597e4236f746434c9a1a24f6191f7ff870cd
+
+--- s390x/vf/chacha-4core.asm.orig	2022-06-02 17:57:16 UTC
++++ s390x/vf/chacha-4core.asm
+@@ -48,59 +48,66 @@ define(`T1', `%v26')
+ define(`T2', `%v27')
+ define(`T3', `%v28')
+ 
++C A workaround for expanding multiple digits of argument references to QR macro which is incompatible with POSIX
++C See https://www.gnu.org/software/m4/manual/html_node/Arguments.html
++define(`P1',
++`ifelse($1, 0, %v0, $1, 1, %v1, $1, 2, %v2, $1, 3, %v3, $1, 4, %v4, $1, 5, %v5, $1, 6, %v6, $1, 7, %v7, $1, 8, %v8, $1, 9, %v9, $1, 10, %v10, $1, 11, %v11, $1, 12, %v12, $1, 13, %v13, $1, 14, %v14, $1, 15, %v15)')
++define(`P2',
++`ifelse($1, 0, %v0, $1, 1, %v5, $1, 2, %v10, $1, 3, %v15, $1, 4, %v4, $1, 5, %v9, $1, 6, %v14, $1, 7, %v3, $1, 8, %v8, $1, 9, %v13, $1, 10, %v2, $1, 11, %v7, $1, 12, %v12, $1, 13, %v1, $1, 14, %v6, $1, 15, %v11)')
++
+ C Main loop for round
+ define(`QR',`
+-	vaf		$1, $1, $2
+-	vaf		$5, $5, $6
+-	vaf		$9, $9, $10
+-	vaf		$13, $13, $14
+-	vx		$4, $4, $1
+-	vx		$8, $8, $5
+-	vx		$12, $12, $9
+-	vx		$16, $16, $13
+-	verllf	$4, $4, 16
+-	verllf	$8, $8, 16
+-	verllf	$12, $12, 16
+-	verllf	$16, $16, 16
++	vaf		$1(0), $1(0), $1(1)
++	vaf		$1(4), $1(4), $1(5)
++	vaf		$1(8), $1(8), $1(9)
++	vaf		$1(12), $1(12), $1(13)
++	vx		$1(3), $1(3), $1(0)
++	vx		$1(7), $1(7), $1(4)
++	vx		$1(11), $1(11), $1(8)
++	vx		$1(15), $1(15), $1(12)
++	verllf	$1(3), $1(3), 16
++	verllf	$1(7), $1(7), 16
++	verllf	$1(11), $1(11), 16
++	verllf	$1(15), $1(15), 16
+ 
+-	vaf		$3, $3, $4
+-	vaf		$7, $7, $8
+-	vaf		$11, $11, $12
+-	vaf		$15, $15, $16
+-	vx		$2, $2, $3
+-	vx		$6, $6, $7
+-	vx		$10, $10, $11
+-	vx		$14, $14, $15
+-	verllf	$2, $2, 12
+-	verllf	$6, $6, 12
+-	verllf	$10, $10, 12
+-	verllf	$14, $14, 12
++	vaf		$1(2), $1(2), $1(3)
++	vaf		$1(6), $1(6), $1(7)
++	vaf		$1(10), $1(10), $1(11)
++	vaf		$1(14), $1(14), $1(15)
++	vx		$1(1), $1(1), $1(2)
++	vx		$1(5), $1(5), $1(6)
++	vx		$1(9), $1(9), $1(10)
++	vx		$1(13), $1(13), $1(14)
++	verllf	$1(1), $1(1), 12
++	verllf	$1(5), $1(5), 12
++	verllf	$1(9), $1(9), 12
++	verllf	$1(13), $1(13), 12
+ 
+-	vaf		$1, $1, $2
+-	vaf		$5, $5, $6
+-	vaf		$9, $9, $10
+-	vaf		$13, $13, $14
+-	vx		$4, $4, $1
+-	vx		$8, $8, $5
+-	vx		$12, $12, $9
+-	vx		$16, $16, $13
+-	verllf	$4, $4, 8
+-	verllf	$8, $8, 8
+-	verllf	$12, $12, 8
+-	verllf	$16, $16, 8
++	vaf		$1(0), $1(0), $1(1)
++	vaf		$1(4), $1(4), $1(5)
++	vaf		$1(8), $1(8), $1(9)
++	vaf		$1(12), $1(12), $1(13)
++	vx		$1(3), $1(3), $1(0)
++	vx		$1(7), $1(7), $1(4)
++	vx		$1(11), $1(11), $1(8)
++	vx		$1(15), $1(15), $1(12)
++	verllf	$1(3), $1(3), 8
++	verllf	$1(7), $1(7), 8
++	verllf	$1(11), $1(11), 8
++	verllf	$1(15), $1(15), 8
+ 
+-	vaf		$3, $3, $4
+-	vaf		$7, $7, $8
+-	vaf		$11, $11, $12
+-	vaf		$15, $15, $16
+-	vx		$2, $2, $3
+-	vx		$6, $6, $7
+-	vx		$10, $10, $11
+-	vx		$14, $14, $15
+-	verllf	$2, $2, 7
+-	verllf	$6, $6, 7
+-	verllf	$10, $10, 7
+-	verllf	$14, $14, 7
++	vaf		$1(2), $1(2), $1(3)
++	vaf		$1(6), $1(6), $1(7)
++	vaf		$1(10), $1(10), $1(11)
++	vaf		$1(14), $1(14), $1(15)
++	vx		$1(1), $1(1), $1(2)
++	vx		$1(5), $1(5), $1(6)
++	vx		$1(9), $1(9), $1(10)
++	vx		$1(13), $1(13), $1(14)
++	verllf	$1(1), $1(1), 7
++	verllf	$1(5), $1(5), 7
++	verllf	$1(9), $1(9), 7
++	verllf	$1(13), $1(13), 7
+ ')
+ 
+ define(`TRANSPOSE',`
+@@ -176,8 +183,8 @@ C Load state and splat
+ 	srlg	ROUNDS, ROUNDS, 1
+ 
+ .Loop:
+-	QR(%v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7, %v8, %v9, %v10, %v11, %v12, %v13, %v14, %v15)
+-	QR(%v0, %v5, %v10, %v15, %v4, %v9, %v14, %v3, %v8, %v13, %v2, %v7, %v12, %v1, %v6, %v11)
++	QR(`P1')
++	QR(`P2')
+ 	brctg	ROUNDS, .Loop
+ 
+ 	C Add in saved original words, including counters, before