git: ba6b771d1732 - stable/13 - ktls: Ensure FIFO encryption order for TLS 1.0.

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Tue, 23 Nov 2021 23:12:55 UTC
The branch stable/13 has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=ba6b771d1732eda0546d187b1397b1bcded3208d

commit ba6b771d1732eda0546d187b1397b1bcded3208d
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2021-10-13 19:30:15 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2021-11-23 23:11:44 +0000

    ktls: Ensure FIFO encryption order for TLS 1.0.
    
    TLS 1.0 records are encrypted as one continuous CBC chain where the
    last block of the previous record is used as the IV for the next
    record.  As a result, TLS 1.0 records cannot be encrypted out of order
    but must be encrypted as a FIFO.
    
    If the later pages of a sendfile(2) request complete before the first
    pages, then TLS records can be encrypted out of order.  For TLS 1.1
    and later this is fine, but this can break for TLS 1.0.
    
    To cope, add a queue in each TLS session to hold TLS records that
    contain valid unencrypted data but are waiting for an earlier TLS
    record to be encrypted first.
    
    - In ktls_enqueue(), check if a TLS record being queued is the next
      record expected for a TLS 1.0 session.  If not, it is placed in
      sorted order in the pending_records queue in the TLS session.
    
      If it is the next expected record, queue it for SW encryption like
      normal.  In addition, check if this new record (really a potential
      batch of records) was holding up any previously queued records in
      the pending_records queue.  Any of those records that are now in
      order are also placed on the queue for SW encryption.
    
    - In ktls_destroy(), free any TLS records on the pending_records
      queue.  These mbufs are marked M_NOTREADY so were not freed when the
      socket buffer was purged in sbdestroy().  Instead, they must be
      freed explicitly.
    
    Reviewed by:    gallatin, markj
    Sponsored by:   Netflix
    Differential Revision:  https://reviews.freebsd.org/D32381
    
    (cherry picked from commit 9f03d2c00167c8047416e0048e3b7f89d73baf8e)
---
 sys/kern/uipc_ktls.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++--
 sys/sys/ktls.h       |   5 +++
 2 files changed, 115 insertions(+), 3 deletions(-)

diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index 73915600779c..e9585d06841c 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -138,6 +138,11 @@ static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active);
 SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
     &ktls_tasks_active, "Number of active tasks");
 
+static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_pending);
+SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_pending, CTLFLAG_RD,
+    &ktls_cnt_tx_pending,
+    "Number of TLS 1.0 records waiting for earlier TLS records");
+
 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_queued);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD,
     &ktls_cnt_tx_queued,
@@ -574,6 +579,9 @@ ktls_create_session(struct socket *so, struct tls_enable *en,
 		case CRYPTO_SHA1_HMAC:
 			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
 				/* Implicit IV, no nonce. */
+				tls->sequential_records = true;
+				tls->next_seqno = be64dec(en->rec_seq);
+				STAILQ_INIT(&tls->pending_records);
 			} else {
 				tls->params.tls_hlen += AES_BLOCK_LEN;
 			}
@@ -1481,6 +1489,20 @@ ktls_destroy(struct ktls_session *tls)
 {
 	struct rm_priotracker prio;
 
+	if (tls->sequential_records) {
+		struct mbuf *m, *n;
+		int page_count;
+
+		STAILQ_FOREACH_SAFE(m, &tls->pending_records, m_epg_stailq, n) {
+			page_count = m->m_epg_enc_cnt;
+			while (page_count > 0) {
+				KASSERT(page_count >= m->m_epg_nrdy,
+				    ("%s: too few pages", __func__));
+				page_count -= m->m_epg_nrdy;
+				m = m_free(m);
+			}
+		}
+	}
 	ktls_cleanup(tls);
 	if (tls->be != NULL && ktls_allow_unload) {
 		rm_rlock(&ktls_backends_lock, &prio);
@@ -1982,10 +2004,29 @@ ktls_enqueue_to_free(struct mbuf *m)
 		wakeup(wq);
 }
 
+/* Number of TLS records in a batch passed to ktls_enqueue(). */
+static u_int
+ktls_batched_records(struct mbuf *m)
+{
+	int page_count, records;
+
+	records = 0;
+	page_count = m->m_epg_enc_cnt;
+	while (page_count > 0) {
+		records++;
+		page_count -= m->m_epg_nrdy;
+		m = m->m_next;
+	}
+	KASSERT(page_count == 0, ("%s: mismatched page count", __func__));
+	return (records);
+}
+
 void
 ktls_enqueue(struct mbuf *m, struct socket *so, int page_count)
 {
+	struct ktls_session *tls;
 	struct ktls_wq *wq;
+	int queued;
 	bool running;
 
 	KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) ==
@@ -2003,14 +2044,80 @@ ktls_enqueue(struct mbuf *m, struct socket *so, int page_count)
 	 */
 	m->m_epg_so = so;
 
-	wq = &ktls_wq[m->m_epg_tls->wq_index];
+	queued = 1;
+	tls = m->m_epg_tls;
+	wq = &ktls_wq[tls->wq_index];
 	mtx_lock(&wq->mtx);
-	STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
+	if (__predict_false(tls->sequential_records)) {
+		/*
+		 * For TLS 1.0, records must be encrypted
+		 * sequentially.  For a given connection, all records
+		 * queued to the associated work queue are processed
+		 * sequentially.  However, sendfile(2) might complete
+		 * I/O requests spanning multiple TLS records out of
+		 * order.  Here we ensure TLS records are enqueued to
+		 * the work queue in FIFO order.
+		 *
+		 * tls->next_seqno holds the sequence number of the
+		 * next TLS record that should be enqueued to the work
+		 * queue.  If this next record is not tls->next_seqno,
+		 * it must be a future record, so insert it, sorted by
+		 * TLS sequence number, into tls->pending_records and
+		 * return.
+		 *
+		 * If this TLS record matches tls->next_seqno, place
+		 * it in the work queue and then check
+		 * tls->pending_records to see if any
+		 * previously-queued records are now ready for
+		 * encryption.
+		 */
+		if (m->m_epg_seqno != tls->next_seqno) {
+			struct mbuf *n, *p;
+
+			p = NULL;
+			STAILQ_FOREACH(n, &tls->pending_records, m_epg_stailq) {
+				if (n->m_epg_seqno > m->m_epg_seqno)
+					break;
+				p = n;
+			}
+			if (n == NULL)
+				STAILQ_INSERT_TAIL(&tls->pending_records, m,
+				    m_epg_stailq);
+			else if (p == NULL)
+				STAILQ_INSERT_HEAD(&tls->pending_records, m,
+				    m_epg_stailq);
+			else
+				STAILQ_INSERT_AFTER(&tls->pending_records, p, m,
+				    m_epg_stailq);
+			mtx_unlock(&wq->mtx);
+			counter_u64_add(ktls_cnt_tx_pending, 1);
+			return;
+		}
+
+		tls->next_seqno += ktls_batched_records(m);
+		STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
+
+		while (!STAILQ_EMPTY(&tls->pending_records)) {
+			struct mbuf *n;
+
+			n = STAILQ_FIRST(&tls->pending_records);
+			if (n->m_epg_seqno != tls->next_seqno)
+				break;
+
+			queued++;
+			STAILQ_REMOVE_HEAD(&tls->pending_records, m_epg_stailq);
+			tls->next_seqno += ktls_batched_records(n);
+			STAILQ_INSERT_TAIL(&wq->m_head, n, m_epg_stailq);
+		}
+		counter_u64_add(ktls_cnt_tx_pending, -(queued - 1));
+	} else
+		STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
+
 	running = wq->running;
 	mtx_unlock(&wq->mtx);
 	if (!running)
 		wakeup(wq);
-	counter_u64_add(ktls_cnt_tx_queued, 1);
+	counter_u64_add(ktls_cnt_tx_queued, queued);
 }
 
 static __noinline void
diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h
index 3cde75f9edf6..e9cae34035c6 100644
--- a/sys/sys/ktls.h
+++ b/sys/sys/ktls.h
@@ -208,6 +208,11 @@ struct ktls_session {
 	struct task reset_tag_task;
 	struct inpcb *inp;
 	bool reset_pending;
+	bool sequential_records;
+
+	/* Only used for TLS 1.0. */
+	uint64_t next_seqno;
+	STAILQ_HEAD(, mbuf) pending_records;
 } __aligned(CACHE_LINE_SIZE);
 
 void ktls_check_rx(struct sockbuf *sb);