svn commit: r360584 - in projects/nfs-over-tls/sys: dev/cxgbe/tom kern modules opencrypto sys

Rick Macklem rmacklem at FreeBSD.org
Sun May 3 00:46:15 UTC 2020


Author: rmacklem
Date: Sun May  3 00:46:14 2020
New Revision: 360584
URL: https://svnweb.freebsd.org/changeset/base/360584

Log:
  Add jhb@'s ktls rx patch to the kernel.
  
  I also removed a few modules from sys/modules/Makefile, since those
  do not build from a bare sys tree; they require a full src tree.
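  
  As an illustration of the new userland-facing pieces (not itself part
  of the diff below), here is a minimal sketch of enabling KTLS RX on a
  socket after the TLS handshake has completed.  The TCP_RXTLS_ENABLE
  option name and the cipher_key/iv fields of struct tls_enable are
  assumptions for the sketch; the diff below only shows rec_seq,
  cipher_algorithm, and the version fields.  rec_seq must hold the
  sequence number of the next record the peer will send, since
  ktls_enable_rx() seeds sb_tls_seqno from it.
  
  #include <sys/types.h>
  #include <sys/socket.h>
  #include <sys/ktls.h>
  #include <crypto/cryptodev.h>
  #include <netinet/in.h>
  #include <netinet/tcp.h>
  #include <stdint.h>
  #include <string.h>
  
  static int
  enable_ktls_rx(int s, const uint8_t *key, size_t keylen,
      const uint8_t *iv, size_t ivlen, const uint8_t rec_seq[8])
  {
  	struct tls_enable en;
  
  	memset(&en, 0, sizeof(en));
  	en.cipher_algorithm = CRYPTO_AES_NIST_GCM_16;
  	en.cipher_key = key;			/* assumed field names */
  	en.cipher_key_len = keylen;
  	en.iv = iv;
  	en.iv_len = ivlen;
  	en.tls_vmajor = TLS_MAJOR_VER_ONE;
  	en.tls_vminor = TLS_MINOR_VER_TWO;	/* 1.3 is rejected below */
  	memcpy(en.rec_seq, rec_seq, sizeof(en.rec_seq));
  	/* TCP_RXTLS_ENABLE is an assumed option name. */
  	return (setsockopt(s, IPPROTO_TCP, TCP_RXTLS_ENABLE, &en,
  	    sizeof(en)));
  }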

Modified:
  projects/nfs-over-tls/sys/dev/cxgbe/tom/t4_tom.c
  projects/nfs-over-tls/sys/kern/uipc_ktls.c
  projects/nfs-over-tls/sys/kern/uipc_sockbuf.c
  projects/nfs-over-tls/sys/kern/uipc_socket.c
  projects/nfs-over-tls/sys/modules/Makefile
  projects/nfs-over-tls/sys/opencrypto/ktls_ocf.c
  projects/nfs-over-tls/sys/sys/ktls.h
  projects/nfs-over-tls/sys/sys/sockbuf.h
  projects/nfs-over-tls/sys/sys/socket.h
  projects/nfs-over-tls/sys/sys/socketvar.h

Modified: projects/nfs-over-tls/sys/dev/cxgbe/tom/t4_tom.c
==============================================================================
--- projects/nfs-over-tls/sys/dev/cxgbe/tom/t4_tom.c	Sun May  3 00:37:16 2020	(r360583)
+++ projects/nfs-over-tls/sys/dev/cxgbe/tom/t4_tom.c	Sun May  3 00:46:14 2020	(r360584)
@@ -1080,7 +1080,9 @@ is_tls_sock(struct socket *so, struct adapter *sc)
 	struct inpcb *inp = sotoinpcb(so);
 	int i, rc;
 
-	/* XXX: Eventually add a SO_WANT_TLS socket option perhaps? */
+	if (so_options_get(so) & SO_WANT_KTLS)
+		return (1);
+
 	rc = 0;
 	ADAPTER_LOCK(sc);
 	for (i = 0; i < sc->tt.num_tls_rx_ports; i++) {

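The hunk above lets an application ask the Chelsio TOE to claim a
connection for TLS offload instead of relying on the per-port
tt.num_tls_rx_ports list.  A minimal sketch of setting the new option
before connect(2); it assumes SO_WANT_KTLS (added to sys/socket.h in
this commit) behaves like the other SOL_SOCKET so_options bits:

	int one = 1;

	if (setsockopt(s, SOL_SOCKET, SO_WANT_KTLS, &one,
	    sizeof(one)) == -1)
		err(1, "setsockopt(SO_WANT_KTLS)");	/* <err.h> */
	if (connect(s, (struct sockaddr *)&sin, sizeof(sin)) == -1)
		err(1, "connect");
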
Modified: projects/nfs-over-tls/sys/kern/uipc_ktls.c
==============================================================================
--- projects/nfs-over-tls/sys/kern/uipc_ktls.c	Sun May  3 00:37:16 2020	(r360583)
+++ projects/nfs-over-tls/sys/kern/uipc_ktls.c	Sun May  3 00:46:14 2020	(r360584)
@@ -79,7 +79,8 @@ __FBSDID("$FreeBSD$");
 
 struct ktls_wq {
 	struct mtx	mtx;
-	STAILQ_HEAD(, mbuf_ext_pgs) head;
+	STAILQ_HEAD(, mbuf_ext_pgs) pgs_head;
+	STAILQ_HEAD(, socket) so_head;
 	bool		running;
 } __aligned(CACHE_LINE_SIZE);
 
@@ -131,10 +132,16 @@ static counter_u64_t ktls_tasks_active;
 SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
     &ktls_tasks_active, "Number of active tasks");
 
-static counter_u64_t ktls_cnt_on;
-SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, so_inqueue, CTLFLAG_RD,
-    &ktls_cnt_on, "Number of TLS records in queue to tasks for SW crypto");
+static counter_u64_t ktls_cnt_tx_queued;
+SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD,
+    &ktls_cnt_tx_queued,
+    "Number of TLS records in queue to tasks for SW encryption");
 
+static counter_u64_t ktls_cnt_rx_queued;
+SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD,
+    &ktls_cnt_rx_queued,
+    "Number of TLS sockets in queue to tasks for SW decryption");
+
 static counter_u64_t ktls_offload_total;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total,
     CTLFLAG_RD, &ktls_offload_total,
@@ -149,6 +156,10 @@ static counter_u64_t ktls_offload_active;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD,
     &ktls_offload_active, "Total Active TLS sessions");
 
+static counter_u64_t ktls_offload_corrupted_records;
+SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD,
+    &ktls_offload_corrupted_records, "Total corrupted TLS records received");
+
 static counter_u64_t ktls_offload_failed_crypto;
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD,
     &ktls_offload_failed_crypto, "Total TLS crypto failures");
@@ -334,10 +345,12 @@ ktls_init(void *dummy __unused)
 	int error, i;
 
 	ktls_tasks_active = counter_u64_alloc(M_WAITOK);
-	ktls_cnt_on = counter_u64_alloc(M_WAITOK);
+	ktls_cnt_tx_queued = counter_u64_alloc(M_WAITOK);
+	ktls_cnt_rx_queued = counter_u64_alloc(M_WAITOK);
 	ktls_offload_total = counter_u64_alloc(M_WAITOK);
 	ktls_offload_enable_calls = counter_u64_alloc(M_WAITOK);
 	ktls_offload_active = counter_u64_alloc(M_WAITOK);
+	ktls_offload_corrupted_records = counter_u64_alloc(M_WAITOK);
 	ktls_offload_failed_crypto = counter_u64_alloc(M_WAITOK);
 	ktls_switch_to_ifnet = counter_u64_alloc(M_WAITOK);
 	ktls_switch_to_sw = counter_u64_alloc(M_WAITOK);
@@ -370,7 +383,8 @@ ktls_init(void *dummy __unused)
 	 * work queue for each CPU.
 	 */
 	CPU_FOREACH(i) {
-		STAILQ_INIT(&ktls_wq[i].head);
+		STAILQ_INIT(&ktls_wq[i].pgs_head);
+		STAILQ_INIT(&ktls_wq[i].so_head);
 		mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF);
 		error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i],
 		    &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i);
@@ -859,7 +873,7 @@ ktls_try_ifnet(struct socket *so, struct ktls_session 
 }
 
 static int
-ktls_try_sw(struct socket *so, struct ktls_session *tls)
+ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction)
 {
 	struct rm_priotracker prio;
 	struct ktls_crypto_backend *be;
@@ -874,7 +888,7 @@ ktls_try_sw(struct socket *so, struct ktls_session *tl
 	if (ktls_allow_unload)
 		rm_rlock(&ktls_backends_lock, &prio);
 	LIST_FOREACH(be, &ktls_backends, next) {
-		if (be->try(so, tls) == 0)
+		if (be->try(so, tls, direction) == 0)
 			break;
 		KASSERT(tls->cipher == NULL,
 		    ("ktls backend leaked a cipher pointer"));
@@ -900,6 +914,61 @@ ktls_try_sw(struct socket *so, struct ktls_session *tl
 	return (0);
 }
 
+/*
+ * KTLS RX stores data in the socket buffer as a list of TLS records,
+ * where each record is stored as a control message containg the TLS
+ * header followed by data mbufs containing the decrypted data.  This
+ * is different from KTLS TX which always uses an mb_ext_pgs mbuf for
+ * both encrypted and decrypted data.  TLS records decrypted by a NIC
+ * should be queued to the socket buffer as records, but encrypted
+ * data which needs to be decrypted by software arrives as a stream of
+ * regular mbufs which need to be converted.  In addition, there may
+ * already be pending encrypted data in the socket buffer when KTLS RX
+ * is enabled.
+ *
+ * To manage not-yet-decrypted data for KTLS RX, the following scheme
+ * is used:
+ *
+ * - A single chain of NOTREADY mbufs is hung off of sb_mtls.
+ *
+ * - ktls_check_rx checks this chain of mbufs, reading the TLS header
+ *   from the first mbuf.  Once all of the data for that TLS record is
+ *   queued, the socket is queued to a worker thread.
+ *
+ * - The worker thread calls ktls_decrypt to decrypt TLS records in
+ *   the TLS chain.  Each TLS record is detached from the TLS chain,
+ *   decrypted, and inserted into the regular socket buffer chain as
+ *   a record starting with a control message holding the TLS header
+ *   and a chain of mbufs holding the decrypted data.
+ */
+
+static void
+sb_mark_notready(struct sockbuf *sb)
+{
+	struct mbuf *m;
+
+	m = sb->sb_mb;
+	sb->sb_mtls = m;
+	sb->sb_mb = NULL;
+	sb->sb_mbtail = NULL;
+	sb->sb_lastrecord = NULL;
+	for (; m != NULL; m = m->m_next) {
+		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL",
+		    __func__));
+		KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail",
+		    __func__));
+		KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len",
+		    __func__));
+		m->m_flags |= M_NOTREADY;
+		sb->sb_acc -= m->m_len;
+		sb->sb_tlscc += m->m_len;
+		sb->sb_mtlstail = m;
+	}
+	KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc,
+	    ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc,
+	    sb->sb_ccc));
+}
+
 int
 ktls_enable_rx(struct socket *so, struct tls_enable *en)
 {
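The comment block above describes the shape in which decrypted records
finally reach userland: each record arrives as a TLS_GET_RECORD control
message (struct tls_get_record, delivered at the IPPROTO_TCP level per
the sbcreatecontrol_how() call later in this diff) followed by the
decrypted payload.  A minimal read sketch, not part of this commit
(same includes as the enable sketch in the log message, plus
<sys/uio.h>):

	static ssize_t
	read_tls_record(int s, void *buf, size_t len,
	    struct tls_get_record *tgr)
	{
		char cbuf[CMSG_SPACE(sizeof(*tgr))];
		struct cmsghdr *cmsg;
		struct msghdr msg;
		struct iovec iov;
		ssize_t n;

		memset(&msg, 0, sizeof(msg));
		iov.iov_base = buf;
		iov.iov_len = len;
		msg.msg_iov = &iov;
		msg.msg_iovlen = 1;
		msg.msg_control = cbuf;
		msg.msg_controllen = sizeof(cbuf);

		n = recvmsg(s, &msg, 0);
		if (n <= 0)
			return (n);

		/* The TLS header for this record rides in the control
		 * message; tgr->tls_length is the decrypted payload
		 * length in network byte order. */
		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
		    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
			if (cmsg->cmsg_level == IPPROTO_TCP &&
			    cmsg->cmsg_type == TLS_GET_RECORD)
				memcpy(tgr, CMSG_DATA(cmsg), sizeof(*tgr));
		}
		return (n);
	}
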
@@ -928,16 +997,20 @@ ktls_enable_rx(struct socket *so, struct tls_enable *e
 	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 		return (ENOTSUP);
 
+	/* TLS 1.3 is not yet supported. */
+	if (en->tls_vmajor == TLS_MAJOR_VER_ONE &&
+	    en->tls_vminor == TLS_MINOR_VER_THREE)
+		return (ENOTSUP);
+
 	error = ktls_create_session(so, en, &tls);
 	if (error)
 		return (error);
 
-	/* TLS RX offload is only supported on TOE currently. */
 #ifdef TCP_OFFLOAD
 	error = ktls_try_toe(so, tls, KTLS_RX);
-#else
-	error = EOPNOTSUPP;
+	if (error)
 #endif
+		error = ktls_try_sw(so, tls, KTLS_RX);
 
 	if (error) {
 		ktls_cleanup(tls);
@@ -946,7 +1019,13 @@ ktls_enable_rx(struct socket *so, struct tls_enable *e
 
 	/* Mark the socket as using TLS offload. */
 	SOCKBUF_LOCK(&so->so_rcv);
+	so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq);
 	so->so_rcv.sb_tls_info = tls;
+	so->so_rcv.sb_flags |= SB_TLS_RX;
+
+	/* Mark existing data as not ready until it can be decrypted. */
+	sb_mark_notready(&so->so_rcv);
+	ktls_check_rx(&so->so_rcv);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	counter_u64_add(ktls_offload_total, 1);
@@ -997,7 +1076,7 @@ ktls_enable_tx(struct socket *so, struct tls_enable *e
 #endif
 		error = ktls_try_ifnet(so, tls, false);
 	if (error)
-		error = ktls_try_sw(so, tls);
+		error = ktls_try_sw(so, tls, KTLS_TX);
 
 	if (error) {
 		ktls_cleanup(tls);
@@ -1102,7 +1181,7 @@ ktls_set_tx_mode(struct socket *so, int mode)
 	if (mode == TCP_TLS_MODE_IFNET)
 		error = ktls_try_ifnet(so, tls_new, true);
 	else
-		error = ktls_try_sw(so, tls_new);
+		error = ktls_try_sw(so, tls_new, KTLS_TX);
 	if (error) {
 		counter_u64_add(ktls_switch_failed, 1);
 		ktls_free(tls_new);
@@ -1430,6 +1509,371 @@ ktls_frame(struct mbuf *top, struct ktls_session *tls,
 }
 
 void
+ktls_check_rx(struct sockbuf *sb)
+{
+	struct tls_record_layer hdr;
+	struct ktls_wq *wq;
+	struct socket *so;
+	bool running;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+	KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX",
+	    __func__, sb));
+	so = __containerof(sb, struct socket, so_rcv);
+
+	if (sb->sb_flags & SB_TLS_RX_RUNNING)
+		return;
+
+	/* Is there enough queued for a TLS header? */
+	if (sb->sb_tlscc < sizeof(hdr)) {
+		if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc != 0)
+			so->so_error = EMSGSIZE;
+		return;
+	}
+
+	m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr);
+
+	/* Is the entire record queued? */
+	if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) {
+		if ((sb->sb_state & SBS_CANTRCVMORE) != 0)
+			so->so_error = EMSGSIZE;
+		return;
+	}
+
+	sb->sb_flags |= SB_TLS_RX_RUNNING;
+
+	soref(so);
+	wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index];
+	mtx_lock(&wq->mtx);
+	STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list);
+	running = wq->running;
+	mtx_unlock(&wq->mtx);
+	if (!running)
+		wakeup(wq);
+	counter_u64_add(ktls_cnt_rx_queued, 1);
+}
+
+static struct mbuf *
+ktls_detach_record(struct sockbuf *sb, int len)
+{
+	struct mbuf *m, *n, *top;
+	int remain;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+	MPASS(len <= sb->sb_tlscc);
+
+	/*
+	 * If TLS chain is the exact size of the record,
+	 * just grab the whole record.
+	 */
+	top = sb->sb_mtls;
+	if (sb->sb_tlscc == len) {
+		sb->sb_mtls = NULL;
+		sb->sb_mtlstail = NULL;
+		goto out;
+	}
+
+	/*
+	 * While it would be nice to use m_split() here, we need
+	 * to know exactly what m_split() allocates to update the
+	 * accounting, so do it inline instead.
+	 */
+	remain = len;
+	for (m = top; remain > m->m_len; m = m->m_next)
+		remain -= m->m_len;
+
+	/* Easy case: don't have to split 'm'. */
+	if (remain == m->m_len) {
+		sb->sb_mtls = m->m_next;
+		if (sb->sb_mtls == NULL)
+			sb->sb_mtlstail = NULL;
+		m->m_next = NULL;
+		goto out;
+	}
+
+	/*
+	 * Need to allocate an mbuf to hold the remainder of 'm'.  Try
+	 * with M_NOWAIT first.
+	 */
+	n = m_get(M_NOWAIT, MT_DATA);
+	if (n == NULL) {
+		/*
+		 * Use M_WAITOK with socket buffer unlocked.  If
+		 * 'sb_mtls' changes while the lock is dropped, return
+		 * NULL to force the caller to retry.
+		 */
+		SOCKBUF_UNLOCK(sb);
+
+		n = m_get(M_WAITOK, MT_DATA);
+
+		SOCKBUF_LOCK(sb);
+		if (sb->sb_mtls != top) {
+			m_free(n);
+			return (NULL);
+		}
+	}
+	n->m_flags |= M_NOTREADY;
+
+	/* Store remainder in 'n'. */
+	n->m_len = m->m_len - remain;
+	if (m->m_flags & M_EXT) {
+		n->m_data = m->m_data + remain;
+		mb_dupcl(n, m);
+	} else {
+		bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len);
+	}
+
+	/* Trim 'm' and update accounting. */
+	m->m_len -= n->m_len;
+	sb->sb_tlscc -= n->m_len;
+	sb->sb_ccc -= n->m_len;
+
+	/* Account for 'n'. */
+	sballoc_ktls_rx(sb, n);
+
+	/* Insert 'n' into the TLS chain. */
+	sb->sb_mtls = n;
+	n->m_next = m->m_next;
+	if (sb->sb_mtlstail == m)
+		sb->sb_mtlstail = n;
+
+	/* Detach the record from the TLS chain. */
+	m->m_next = NULL;
+
+out:
+	MPASS(m_length(top, NULL) == len);
+	for (m = top; m != NULL; m = m->m_next)
+		sbfree_ktls_rx(sb, m);
+	sb->sb_tlsdcc = len;
+	sb->sb_ccc += len;
+	SBCHECK(sb);
+	return (top);
+}
+
+static int
+m_segments(struct mbuf *m, int skip)
+{
+	int count;
+
+	while (skip >= m->m_len) {
+		skip -= m->m_len;
+		m = m->m_next;
+	}
+
+	for (count = 0; m != NULL; count++)
+		m = m->m_next;
+	return (count);
+}
+
+static void
+ktls_decrypt(struct socket *so)
+{
+	char tls_header[MBUF_PEXT_HDR_LEN];
+	struct ktls_session *tls;
+	struct sockbuf *sb;
+	struct tls_record_layer *hdr;
+	struct iovec *iov;
+	struct tls_get_record tgr;
+	struct mbuf *control, *data, *m;
+	uint64_t seqno;
+	int error, i, iov_cap, iov_count, remain, tls_len, trail_len;
+
+	hdr = (struct tls_record_layer *)tls_header;
+	sb = &so->so_rcv;
+	SOCKBUF_LOCK(sb);
+	KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING,
+	    ("%s: socket %p not running", __func__, so));
+
+	tls = sb->sb_tls_info;
+	MPASS(tls != NULL);
+
+	iov = NULL;
+	iov_cap = 0;
+	for (;;) {
+		/* Is there enough queued for a TLS header? */
+		if (sb->sb_tlscc < tls->params.tls_hlen)
+			break;
+
+		m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header);
+		tls_len = sizeof(*hdr) + ntohs(hdr->tls_length);
+
+		if (hdr->tls_vmajor != tls->params.tls_vmajor ||
+		    hdr->tls_vminor != tls->params.tls_vminor)
+			error = EINVAL;
+		else if (tls_len < tls->params.tls_hlen || tls_len >
+		    tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 +
+		    tls->params.tls_tlen)
+			error = EMSGSIZE;
+		else
+			error = 0;
+		if (__predict_false(error != 0)) {
+			/*
+			 * We have a corrupted record and are likely
+			 * out of sync.  The connection isn't
+			 * recoverable at this point, so abort it.
+			 */
+			SOCKBUF_UNLOCK(sb);
+			counter_u64_add(ktls_offload_corrupted_records, 1);
+
+			CURVNET_SET(so->so_vnet);
+			so->so_proto->pr_usrreqs->pru_abort(so);
+			so->so_error = error;
+			CURVNET_RESTORE();
+			goto deref;
+		}
+
+		/* Is the entire record queued? */
+		if (sb->sb_tlscc < tls_len)
+			break;
+
+		/*
+		 * Split out the portion of the mbuf chain containing
+		 * this TLS record.
+		 */
+		data = ktls_detach_record(sb, tls_len);
+		if (data == NULL)
+			continue;
+		MPASS(sb->sb_tlsdcc == tls_len);
+
+		seqno = sb->sb_tls_seqno;
+		sb->sb_tls_seqno++;
+		SBCHECK(sb);
+		SOCKBUF_UNLOCK(sb);
+
+		/*
+		 * Build an I/O vector spanning the TLS record payload
+		 * and trailer but skipping the header.
+		 */
+		iov_count = m_segments(data, tls->params.tls_hlen);
+		if (iov_count > iov_cap) {
+			free(iov, M_KTLS);
+			iov = malloc(sizeof(*iov) * iov_count, M_KTLS,
+			    M_WAITOK);
+			iov_cap = iov_count;
+		}
+		remain = tls->params.tls_hlen;
+		for (m = data; remain >= m->m_len; m = m->m_next)
+			remain -= m->m_len;
+		iov[0].iov_base = m->m_data + remain;
+		iov[0].iov_len = m->m_len - remain;
+		for (m = m->m_next, i = 1; m != NULL; m = m->m_next, i++) {
+			iov[i].iov_base = m->m_data;
+			iov[i].iov_len = m->m_len;
+		}
+		MPASS(i == iov_count);
+
+		error = tls->sw_decrypt(tls, hdr, iov, iov_count, seqno,
+		    &trail_len);
+		if (error) {
+			counter_u64_add(ktls_offload_failed_crypto, 1);
+
+			SOCKBUF_LOCK(sb);
+			if (sb->sb_tlsdcc == 0) {
+				/*
+				 * sbcut/drop/flush discarded these
+				 * mbufs.
+				 */
+				m_freem(data);
+				break;
+			}
+
+			/*
+			 * Drop this TLS record's data, but keep
+			 * decrypting subsequent records.
+			 */
+			sb->sb_ccc -= tls_len;
+			sb->sb_tlsdcc = 0;
+
+			CURVNET_SET(so->so_vnet);
+			so->so_error = EBADMSG;
+			sorwakeup_locked(so);
+			CURVNET_RESTORE();
+
+			m_freem(data);
+
+			SOCKBUF_LOCK(sb);
+			continue;
+		}
+
+		/* Allocate the control mbuf. */
+		tgr.tls_type = hdr->tls_type;
+		tgr.tls_vmajor = hdr->tls_vmajor;
+		tgr.tls_vminor = hdr->tls_vminor;
+		tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen -
+		    trail_len);
+		control = sbcreatecontrol_how(&tgr, sizeof(tgr),
+		    TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK);
+
+		SOCKBUF_LOCK(sb);
+		if (sb->sb_tlsdcc == 0) {
+			/* sbcut/drop/flush discarded these mbufs. */
+			MPASS(sb->sb_tlscc == 0);
+			m_freem(data);
+			m_freem(control);
+			break;
+		}
+
+		/*
+		 * Clear the 'dcc' accounting in preparation for
+		 * adding the decrypted record.
+		 */
+		sb->sb_ccc -= tls_len;
+		sb->sb_tlsdcc = 0;
+		SBCHECK(sb);
+
+		/* If there is no payload, drop all of the data. */
+		if (tgr.tls_length == htobe16(0)) {
+			m_freem(data);
+			data = NULL;
+		} else {
+			/* Trim header. */
+			remain = tls->params.tls_hlen;
+			while (remain > 0) {
+				if (data->m_len > remain) {
+					data->m_data += remain;
+					data->m_len -= remain;
+					break;
+				}
+				remain -= data->m_len;
+				data = m_free(data);
+			}
+
+			/* Trim trailer and clear M_NOTREADY. */
+			remain = be16toh(tgr.tls_length);
+			m = data;
+			for (m = data; remain > m->m_len; m = m->m_next) {
+				m->m_flags &= ~M_NOTREADY;
+				remain -= m->m_len;
+			}
+			m->m_len = remain;
+			m_freem(m->m_next);
+			m->m_next = NULL;
+			m->m_flags &= ~M_NOTREADY;
+
+			/* Set EOR on the final mbuf. */
+			m->m_flags |= M_EOR;
+		}
+
+		sbappendcontrol_locked(sb, data, control, 0);
+	}
+
+	sb->sb_flags &= ~SB_TLS_RX_RUNNING;
+
+	if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0)
+		so->so_error = EMSGSIZE;
+
+	sorwakeup_locked(so);
+
+deref:
+	SOCKBUF_UNLOCK_ASSERT(sb);
+
+	CURVNET_SET(so->so_vnet);
+	SOCK_LOCK(so);
+	sorele(so);
+	CURVNET_RESTORE();
+}
+
+void
 ktls_enqueue_to_free(struct mbuf_ext_pgs *pgs)
 {
 	struct ktls_wq *wq;
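ktls_decrypt() above surfaces failures through so_error: EINVAL or
EMSGSIZE abort the connection when a record header is corrupted,
EBADMSG marks a record that failed decryption while later records are
still processed, and ktls_check_rx() sets EMSGSIZE when the peer
closes mid-record.  A sketch of a read loop distinguishing these,
using the read_tls_record() sketch shown earlier plus <errno.h> and
<err.h> (handle_record() is a hypothetical application callback):

	char buf[16384];
	struct tls_get_record tgr;
	ssize_t n;

	for (;;) {
		n = read_tls_record(s, buf, sizeof(buf), &tgr);
		if (n > 0) {
			handle_record(tgr.tls_type, buf, (size_t)n);
			continue;
		}
		if (n == 0)
			break;		/* orderly EOF */
		if (errno == EBADMSG)
			continue;	/* one record failed to decrypt;
					   later records may still arrive */
		err(1, "recvmsg");	/* e.g. EMSGSIZE for a record
					   truncated by the peer's close */
	}
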
@@ -1439,7 +1883,7 @@ ktls_enqueue_to_free(struct mbuf_ext_pgs *pgs)
 	pgs->mbuf = NULL;
 	wq = &ktls_wq[pgs->tls->wq_index];
 	mtx_lock(&wq->mtx);
-	STAILQ_INSERT_TAIL(&wq->head, pgs, stailq);
+	STAILQ_INSERT_TAIL(&wq->pgs_head, pgs, stailq);
 	running = wq->running;
 	mtx_unlock(&wq->mtx);
 	if (!running)
@@ -1473,12 +1917,12 @@ ktls_enqueue(struct mbuf *m, struct socket *so, int pa
 
 	wq = &ktls_wq[pgs->tls->wq_index];
 	mtx_lock(&wq->mtx);
-	STAILQ_INSERT_TAIL(&wq->head, pgs, stailq);
+	STAILQ_INSERT_TAIL(&wq->pgs_head, pgs, stailq);
 	running = wq->running;
 	mtx_unlock(&wq->mtx);
 	if (!running)
 		wakeup(wq);
-	counter_u64_add(ktls_cnt_on, 1);
+	counter_u64_add(ktls_cnt_tx_queued, 1);
 }
 
 static __noinline void
@@ -1633,36 +2077,46 @@ static void
 ktls_work_thread(void *ctx)
 {
 	struct ktls_wq *wq = ctx;
-	struct mbuf_ext_pgs *p, *n;
+	struct mbuf_ext_pgs *p, *pn;
+	struct socket *so, *son;
 	struct ktls_session *tls;
 	struct mbuf *m;
-	STAILQ_HEAD(, mbuf_ext_pgs) local_head;
+	STAILQ_HEAD(, mbuf_ext_pgs) local_pgs_head;
+	STAILQ_HEAD(, socket) local_so_head;
 
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
 	fpu_kern_thread(0);
 #endif
 	for (;;) {
 		mtx_lock(&wq->mtx);
-		while (STAILQ_EMPTY(&wq->head)) {
+		while (STAILQ_EMPTY(&wq->pgs_head) &&
+		    STAILQ_EMPTY(&wq->so_head)) {
 			wq->running = false;
 			mtx_sleep(wq, &wq->mtx, 0, "-", 0);
 			wq->running = true;
 		}
 
-		STAILQ_INIT(&local_head);
-		STAILQ_CONCAT(&local_head, &wq->head);
+		STAILQ_INIT(&local_pgs_head);
+		STAILQ_CONCAT(&local_pgs_head, &wq->pgs_head);
+		STAILQ_INIT(&local_so_head);
+		STAILQ_CONCAT(&local_so_head, &wq->so_head);
 		mtx_unlock(&wq->mtx);
 
-		STAILQ_FOREACH_SAFE(p, &local_head, stailq, n) {
+		STAILQ_FOREACH_SAFE(p, &local_pgs_head, stailq, pn) {
 			if (p->mbuf != NULL) {
 				ktls_encrypt(p);
-				counter_u64_add(ktls_cnt_on, -1);
+				counter_u64_add(ktls_cnt_tx_queued, -1);
 			} else {
 				tls = p->tls;
 				ktls_free(tls);
 				m = __containerof(p, struct mbuf, m_ext_pgs);
 				uma_zfree(zone_mbuf, m);
 			}
+		}
+
+		STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) {
+			ktls_decrypt(so);
+			counter_u64_add(ktls_cnt_rx_queued, -1);
 		}
 	}
 }

Modified: projects/nfs-over-tls/sys/kern/uipc_sockbuf.c
==============================================================================
--- projects/nfs-over-tls/sys/kern/uipc_sockbuf.c	Sun May  3 00:37:16 2020	(r360583)
+++ projects/nfs-over-tls/sys/kern/uipc_sockbuf.c	Sun May  3 00:46:14 2020	(r360584)
@@ -70,6 +70,8 @@ u_long sb_max_adj =
 
 static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
 
+static void	sbcompress_ktls_rx(struct sockbuf *sb, struct mbuf *m,
+    struct mbuf *n);
 static struct mbuf	*sbcut_internal(struct sockbuf *sb, int len);
 static void	sbflush_internal(struct sockbuf *sb);
 
@@ -339,7 +341,52 @@ sbfree(struct sockbuf *sb, struct mbuf *m)
 		sb->sb_sndptroff -= m->m_len;
 }
 
+#ifdef KERN_TLS
 /*
+ * Similar to sballoc/sbfree but does not adjust state associated with
+ * the sb_mb chain such as sb_fnrdy or sb_sndptr*.  Also assumes mbufs
+ * are not ready.
+ */
+void
+sballoc_ktls_rx(struct sockbuf *sb, struct mbuf *m)
+{
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	sb->sb_ccc += m->m_len;
+	sb->sb_tlscc += m->m_len;
+
+	sb->sb_mbcnt += MSIZE;
+	sb->sb_mcnt += 1;
+
+	if (m->m_flags & M_EXT) {
+		sb->sb_mbcnt += m->m_ext.ext_size;
+		sb->sb_ccnt += 1;
+	}
+}
+
+void
+sbfree_ktls_rx(struct sockbuf *sb, struct mbuf *m)
+{
+
+#if 0	/* XXX: not yet: soclose() call path comes here w/o lock. */
+	SOCKBUF_LOCK_ASSERT(sb);
+#endif
+
+	sb->sb_ccc -= m->m_len;
+	sb->sb_tlscc -= m->m_len;
+
+	sb->sb_mbcnt -= MSIZE;
+	sb->sb_mcnt -= 1;
+
+	if (m->m_flags & M_EXT) {
+		sb->sb_mbcnt -= m->m_ext.ext_size;
+		sb->sb_ccnt -= 1;
+	}
+}
+#endif
+
+/*
  * Socantsendmore indicates that no more data will be sent on the socket; it
  * would normally be applied to a socket when the user informs the system
  * that no more data is to be sent, by the protocol code (in case
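sballoc_ktls_rx()/sbfree_ktls_rx() above move bytes in and out of
sb_tlscc while keeping sb_ccc in step, and ktls_detach_record() parks
in-flight bytes in sb_tlsdcc.  Together they imply an invariant that
sbcheck() later verifies piecewise; stated directly as a hypothetical
helper (a sketch; it assumes no other not-ready data is pending on the
sb_mb chain):

	static void
	sb_tls_ccc_check(struct sockbuf *sb)
	{

		SOCKBUF_LOCK_ASSERT(sb);
		KASSERT(sb->sb_ccc ==
		    sb->sb_acc + sb->sb_tlscc + sb->sb_tlsdcc,
		    ("ccc %u != acc %u + tlscc %u + dcc %u", sb->sb_ccc,
		    sb->sb_acc, sb->sb_tlscc, sb->sb_tlsdcc));
	}
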
@@ -375,6 +422,10 @@ socantrcvmore_locked(struct socket *so)
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
+#ifdef KERN_TLS
+	if (so->so_rcv.sb_flags & SB_TLS_RX)
+		ktls_check_rx(&so->so_rcv);
+#endif
 	sorwakeup_locked(so);
 	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
 }
@@ -775,6 +826,24 @@ sblastmbufchk(struct sockbuf *sb, const char *file, in
 		}
 		panic("%s from %s:%u", __func__, file, line);
 	}
+
+#ifdef KERN_TLS
+	m = sb->sb_mtls;
+	while (m && m->m_next)
+		m = m->m_next;
+
+	if (m != sb->sb_mtlstail) {
+		printf("%s: sb_mtls %p sb_mtlstail %p last %p\n",
+			__func__, sb->sb_mtls, sb->sb_mtlstail, m);
+		printf("TLS packet tree:\n");
+		printf("\t");
+		for (m = sb->sb_mtls; m != NULL; m = m->m_next) {
+			printf("%p ", m);
+		}
+		printf("\n");
+		panic("%s from %s:%u", __func__, file, line);
+	}
+#endif
 }
 #endif /* SOCKBUF_DEBUG */
 
@@ -852,7 +921,30 @@ sbappend(struct sockbuf *sb, struct mbuf *m, int flags
 	SOCKBUF_UNLOCK(sb);
 }
 
+#ifdef KERN_TLS
 /*
+ * Append an mbuf containing encrypted TLS data.  The data
+ * is marked M_NOTREADY until it has been decrypted and
+ * stored as a TLS record.
+ */
+static void
+sbappend_ktls_rx(struct sockbuf *sb, struct mbuf *m)
+{
+	struct mbuf *n;
+
+	SBLASTMBUFCHK(sb);
+
+	/* Remove all packet headers and mbuf tags to get a pure data chain. */
+	m_demote(m, 1, 0);
+
+	for (n = m; n != NULL; n = n->m_next)
+		n->m_flags |= M_NOTREADY;
+	sbcompress_ktls_rx(sb, m, sb->sb_mtlstail);
+	ktls_check_rx(sb);
+}
+#endif
+
+/*
  * This version of sbappend() should only be used when the caller absolutely
  * knows that there will never be more than one record in the socket buffer,
  * that is, a stream protocol (such as TCP).
@@ -863,6 +955,19 @@ sbappendstream_locked(struct sockbuf *sb, struct mbuf 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	KASSERT(m->m_nextpkt == NULL,("sbappendstream 0"));
+
+#ifdef KERN_TLS
+	/*
+	 * Decrypted TLS records are appended as records via
+	 * sbappendrecord().  TCP passes encrypted TLS records to this
+	 * function which must be scheduled for decryption.
+	 */
+	if (sb->sb_flags & SB_TLS_RX) {
+		sbappend_ktls_rx(sb, m);
+		return;
+	}
+#endif
+
 	KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1"));
 
 	SBLASTMBUFCHK(sb);
@@ -901,6 +1006,9 @@ sbcheck(struct sockbuf *sb, const char *file, int line
 {
 	struct mbuf *m, *n, *fnrdy;
 	u_long acc, ccc, mbcnt;
+#ifdef KERN_TLS
+	u_long tlscc;
+#endif
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
@@ -936,9 +1044,46 @@ sbcheck(struct sockbuf *sb, const char *file, int line
 			mbcnt += m->m_ext.ext_size;
 	    }
 	}
+#ifdef KERN_TLS
+	/*
+	 * Account for mbufs "detached" by ktls_detach_record() while
+	 * they are decrypted by ktls_decrypt().  tlsdcc gives a count
+	 * of the detached bytes that are included in ccc.  The mbufs
+	 * and clusters are not included in the socket buffer
+	 * accounting.
+	 */
+	ccc += sb->sb_tlsdcc;
+
+	tlscc = 0;
+	for (m = sb->sb_mtls; m; m = m->m_next) {
+		if (m->m_nextpkt != NULL) {
+			printf("sb %p TLS mbuf %p with nextpkt\n", sb, m);
+			goto fail;
+		}
+		if ((m->m_flags & M_NOTREADY) == 0) {
+			printf("sb %p TLS mbuf %p ready\n", sb, m);
+			goto fail;
+		}
+		tlscc += m->m_len;
+		ccc += m->m_len;
+		mbcnt += MSIZE;
+		if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
+			mbcnt += m->m_ext.ext_size;
+	}
+
+	if (sb->sb_tlscc != tlscc) {
+		printf("tlscc %ld/%u dcc %u\n", tlscc, sb->sb_tlscc,
+		    sb->sb_tlsdcc);
+		goto fail;
+	}
+#endif
 	if (acc != sb->sb_acc || ccc != sb->sb_ccc || mbcnt != sb->sb_mbcnt) {
 		printf("acc %ld/%u ccc %ld/%u mbcnt %ld/%u\n",
 		    acc, sb->sb_acc, ccc, sb->sb_ccc, mbcnt, sb->sb_mbcnt);
+#ifdef KERN_TLS
+		printf("tlscc %ld/%u dcc %u\n", tlscc, sb->sb_tlscc,
+		    sb->sb_tlsdcc);
+#endif
 		goto fail;
 	}
 	return;
@@ -1214,14 +1359,72 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct 
 	SBLASTMBUFCHK(sb);
 }
 
+#ifdef KERN_TLS
 /*
+ * A version of sbcompress() for encrypted TLS RX mbufs.  These mbufs
+ * are appended to the 'sb_mtls' chain instead of 'sb_mb' and are also
+ * a bit simpler (no EOR markers, always MT_DATA, etc.).
+ */
+static void
+sbcompress_ktls_rx(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
+{
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	while (m) {
+		KASSERT((m->m_flags & M_EOR) == 0,
+		    ("TLS RX mbuf %p with EOR", m));
+		KASSERT(m->m_type == MT_DATA,
+		    ("TLS RX mbuf %p is not MT_DATA", m));
+		KASSERT((m->m_flags & M_NOTREADY) != 0,
+		    ("TLS RX mbuf %p ready", m));
+		KASSERT((m->m_flags & M_NOMAP) == 0,
+		    ("TLS RX mbuf %p unmapped", m));
+
+		if (m->m_len == 0) {
+			m = m_free(m);
+			continue;
+		}
+
+		/*
+		 * Even though both 'n' and 'm' are NOTREADY, it's ok
+		 * to coalesce the data.
+		 */
+		if (n &&
+		    M_WRITABLE(n) &&
+		    ((sb->sb_flags & SB_NOCOALESCE) == 0) &&
+		    !(n->m_flags & (M_NOMAP)) &&
+		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
+		    m->m_len <= M_TRAILINGSPACE(n)) {
+			m_copydata(m, 0, m->m_len, mtodo(n, n->m_len));
+			n->m_len += m->m_len;
+			sb->sb_ccc += m->m_len;
+			sb->sb_tlscc += m->m_len;
+			m = m_free(m);
+			continue;
+		}
+		if (n)
+			n->m_next = m;
+		else
+			sb->sb_mtls = m;
+		sb->sb_mtlstail = m;
+		sballoc_ktls_rx(sb, m);
+		n = m;
+		m = m->m_next;
+		n->m_next = NULL;
+	}
+	SBLASTMBUFCHK(sb);
+}
+#endif
+
+/*
  * Free all mbufs in a sockbuf.  Check that all resources are reclaimed.
  */
 static void
 sbflush_internal(struct sockbuf *sb)
 {
 
-	while (sb->sb_mbcnt) {
+	while (sb->sb_mbcnt || sb->sb_tlsdcc) {
 		/*
 		 * Don't call sbcut(sb, 0) if the leading mbuf is non-empty:
 		 * we would loop forever. Panic instead.
@@ -1259,6 +1462,7 @@ static struct mbuf *
 sbcut_internal(struct sockbuf *sb, int len)
 {
 	struct mbuf *m, *next, *mfree;
+	bool is_tls;
 
 	KASSERT(len >= 0, ("%s: len is %d but it is supposed to be >= 0",
 	    __func__, len));
@@ -1266,10 +1470,25 @@ sbcut_internal(struct sockbuf *sb, int len)
 	    __func__, len, sb->sb_ccc));
 
 	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
+	is_tls = false;
 	mfree = NULL;
 
 	while (len > 0) {
 		if (m == NULL) {
+#ifdef KERN_TLS
+			if (next == NULL && !is_tls) {
+				if (sb->sb_tlsdcc != 0) {
+					MPASS(len >= sb->sb_tlsdcc);
+					len -= sb->sb_tlsdcc;
+					sb->sb_ccc -= sb->sb_tlsdcc;
+					sb->sb_tlsdcc = 0;
+					if (len == 0)
+						break;
+				}
+				next = sb->sb_mtls;
+				is_tls = true;
+			}
+#endif
 			KASSERT(next, ("%s: no next, len %d", __func__, len));
 			m = next;
 			next = m->m_nextpkt;
@@ -1288,12 +1507,17 @@ sbcut_internal(struct sockbuf *sb, int len)
 			break;
 		}
 		len -= m->m_len;
-		sbfree(sb, m);
+#ifdef KERN_TLS
+		if (is_tls)
+			sbfree_ktls_rx(sb, m);
+		else
+#endif
+			sbfree(sb, m);
 		/*
 		 * Do not put M_NOTREADY buffers to the free list, they
 		 * are referenced from outside.
 		 */
-		if (m->m_flags & M_NOTREADY)
+		if (m->m_flags & M_NOTREADY && !is_tls)
 			m = m->m_next;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***

