git: a6d768d845c1 - main - netmap: add kernel support for the "offsets" feature

Vincenzo Maffione vmaffione at FreeBSD.org
Mon Mar 29 16:29:42 UTC 2021


The branch main has been updated by vmaffione:

URL: https://cgit.FreeBSD.org/src/commit/?id=a6d768d845c173823785c71bb18b40074e7a8998

commit a6d768d845c173823785c71bb18b40074e7a8998
Author:     Vincenzo Maffione <vmaffione at FreeBSD.org>
AuthorDate: 2021-03-29 16:22:48 +0000
Commit:     Vincenzo Maffione <vmaffione at FreeBSD.org>
CommitDate: 2021-03-29 16:29:01 +0000

    netmap: add kernel support for the "offsets" feature
    
    This feature enables applications to ask netmap to transmit or
    receive packets starting at a user-specified offset from the
    beginning of the netmap buffer. It is meant to ease packet
    manipulation operations, such as pushing or popping packet
    headers, that are useful when implementing software switches,
    routers and other packet processors.
    To use the feature, drivers (e.g., iflib or vtnet) must have
    explicit support. This change does not add support for any driver,
    but it introduces the necessary kernel changes. However, offsets
    support is already included for VALE ports and pipes.
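    
    For illustration, here is a minimal userspace sketch of how the
    option can be requested at bind time; the port name and the
    offset values are arbitrary, and error handling is omitted:
    
        struct nmreq_header hdr;
        struct nmreq_register reg;
        struct nmreq_opt_offsets off;
        int fd = open("/dev/netmap", O_RDWR);
    
        memset(&hdr, 0, sizeof(hdr));
        hdr.nr_version = NETMAP_API;
        hdr.nr_reqtype = NETMAP_REQ_REGISTER;
        strlcpy(hdr.nr_name, "vale0:p0", sizeof(hdr.nr_name));
    
        memset(&reg, 0, sizeof(reg));
        reg.nr_mode = NR_REG_ALL_NIC;
        hdr.nr_body = (uintptr_t)&reg;
    
        /* per-slot offsets: at most 128 bytes, with every slot
         * initially pointing 64 bytes into its buffer */
        memset(&off, 0, sizeof(off));
        off.nro_opt.nro_reqtype = NETMAP_REQ_OPT_OFFSETS;
        off.nro_offset_bits = 0;        /* 0 = use the whole ptr field */
        off.nro_max_offset = 128;
        off.nro_initial_offset = 64;
        off.nro_min_gap = 0;
        hdr.nr_options = (uintptr_t)&off;
    
        if (ioctl(fd, NIOCCTRL, &hdr) < 0)
            err(1, "NIOCCTRL");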
---
 sys/dev/netmap/netmap.c         | 258 ++++++++++++++++++++++++++++++---
 sys/dev/netmap/netmap_bdg.c     | 248 ++++++++++++++++++++++++++++----
 sys/dev/netmap/netmap_bdg.h     |   2 +
 sys/dev/netmap/netmap_freebsd.c |   2 +-
 sys/dev/netmap/netmap_generic.c |   2 +-
 sys/dev/netmap/netmap_kern.h    | 119 +++++++++++++++-
 sys/dev/netmap/netmap_mem2.c    | 305 +++++++++++++++++++++++++++-------------
 sys/dev/netmap/netmap_mem2.h    |   2 +-
 sys/dev/netmap/netmap_null.c    |   1 +
 sys/dev/netmap/netmap_pipe.c    |  46 +++---
 sys/dev/netmap/netmap_vale.c    | 193 +++++--------------------
 sys/net/netmap.h                |  55 ++++++++
 sys/net/netmap_legacy.h         |   2 +
 sys/net/netmap_user.h           |  24 +++-
 sys/net/netmap_virt.h           |   4 +-
 15 files changed, 935 insertions(+), 328 deletions(-)

diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c
index f9698096b47a..18de5ef205eb 100644
--- a/sys/dev/netmap/netmap.c
+++ b/sys/dev/netmap/netmap.c
@@ -805,6 +805,14 @@ netmap_update_config(struct netmap_adapter *na)
 static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
 static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
 
+static int
+netmap_default_bufcfg(struct netmap_kring *kring, uint64_t target)
+{
+	kring->hwbuf_len = target;
+	kring->buf_align = 0; /* no alignment */
+	return 0;
+}
+
 /* create the krings array and initialize the fields common to all adapters.
  * The array layout is this:
  *
@@ -885,12 +893,16 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
 			kring->nr_pending_mode = NKR_NETMAP_OFF;
 			if (i < nma_get_nrings(na, t)) {
 				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
+				kring->nm_bufcfg = na->nm_bufcfg;
+				if (kring->nm_bufcfg == NULL)
+					kring->nm_bufcfg = netmap_default_bufcfg;
 			} else {
 				if (!(na->na_flags & NAF_HOST_RINGS))
 					kring->nr_kflags |= NKR_FAKERING;
 				kring->nm_sync = (t == NR_TX ?
 						netmap_txsync_to_host:
 						netmap_rxsync_from_host);
+				kring->nm_bufcfg = netmap_default_bufcfg;
 			}
 			kring->nm_notify = na->nm_notify;
 			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
@@ -969,17 +981,24 @@ netmap_hw_krings_delete(struct netmap_adapter *na)
 	netmap_krings_delete(na);
 }
 
+void
+netmap_mem_restore(struct netmap_adapter *na)
+{
+	if (na->nm_mem_prev) {
+		netmap_mem_put(na->nm_mem);
+		na->nm_mem = na->nm_mem_prev;
+		na->nm_mem_prev = NULL;
+	}
+}
+
 static void
 netmap_mem_drop(struct netmap_adapter *na)
 {
-	int last = netmap_mem_deref(na->nm_mem, na);
 	/* if the native allocator had been overridden on regif,
 	 * restore it now and drop the temporary one
 	 */
-	if (last && na->nm_mem_prev) {
-		netmap_mem_put(na->nm_mem);
-		na->nm_mem = na->nm_mem_prev;
-		na->nm_mem_prev = NULL;
+	if (netmap_mem_deref(na->nm_mem, na)) {
+		netmap_mem_restore(na);
 	}
 }
 
@@ -1571,7 +1590,7 @@ netmap_get_na(struct nmreq_header *hdr,
 	if (error || *na != NULL)
 		goto out;
 
-	/* try to see if this is a bridge port */
+	/* try to see if this is a vale port */
 	error = netmap_get_vale_na(hdr, na, nmd, create);
 	if (error)
 		goto out;
@@ -2232,6 +2251,198 @@ netmap_buf_size_validate(const struct netmap_adapter *na, unsigned mtu) {
 	return 0;
 }
 
+/* Handle the offset option, if present in the hdr.
+ * Returns 0 on success, or an error.
+ */
+static int
+netmap_offsets_init(struct netmap_priv_d *priv, struct nmreq_header *hdr)
+{
+	struct nmreq_opt_offsets *opt;
+	struct netmap_adapter *na = priv->np_na;
+	struct netmap_kring *kring;
+	uint64_t mask = 0, bits = 0, maxbits = sizeof(uint64_t) * 8,
+		 max_offset = 0, initial_offset = 0, min_gap = 0;
+	u_int i;
+	enum txrx t;
+	int error = 0;
+
+	opt = (struct nmreq_opt_offsets *)
+		nmreq_getoption(hdr, NETMAP_REQ_OPT_OFFSETS);
+	if (opt == NULL)
+		return 0;
+
+	if (!(na->na_flags & NAF_OFFSETS)) {
+		if (netmap_verbose)
+			nm_prerr("%s does not support offsets",
+				na->name);
+		error = EOPNOTSUPP;
+		goto out;
+	}
+
+	/* check sanity of the opt values */
+	max_offset = opt->nro_max_offset;
+	min_gap = opt->nro_min_gap;
+	initial_offset = opt->nro_initial_offset;
+	bits = opt->nro_offset_bits;
+
+	if (bits > maxbits) {
+		if (netmap_verbose)
+			nm_prerr("bits: %llu too large (max %llu)",
+				(unsigned long long)bits,
+				(unsigned long long)maxbits);
+		error = EINVAL;
+		goto out;
+	}
+	/* we take bits == 0 as a request to use the entire field */
+	if (bits == 0 || bits == maxbits) {
+		/* shifting a type by sizeof(type) is undefined */
+		bits = maxbits;
+		mask = 0xffffffffffffffff;
+	} else {
+		mask = (1ULL << bits) - 1;
+	}
+	if (max_offset > NETMAP_BUF_SIZE(na)) {
+		if (netmap_verbose)
+			nm_prerr("max offset %llu > buf size %u",
+				(unsigned long long)max_offset, NETMAP_BUF_SIZE(na));
+		error = EINVAL;
+		goto out;
+	}
+	if ((max_offset & mask) != max_offset) {
+		if (netmap_verbose)
+			nm_prerr("max offset %llu to large for %llu bits",
+				(unsigned long long)max_offset,
+				(unsigned long long)bits);
+		error = EINVAL;
+		goto out;
+	}
+	if (initial_offset > max_offset) {
+		if (netmap_verbose)
+			nm_prerr("initial offset %llu > max offset %llu",
+				(unsigned long long)initial_offset,
+				(unsigned long long)max_offset);
+		error = EINVAL;
+		goto out;
+	}
+
+	/* initialize the kring and ring fields. */
+	foreach_selected_ring(priv, t, i, kring) {
+		struct netmap_kring *kring = NMR(na, t)[i];
+		struct netmap_ring *ring = kring->ring;
+		u_int j;
+
+		/* if the ring is already in use we check that the
+		 * new request is compatible with the existing one
+		 */
+		if (kring->offset_mask) {
+			if ((kring->offset_mask & mask) != mask ||
+			     kring->offset_max < max_offset) {
+				if (netmap_verbose)
+					nm_prinf("%s: cannot increase "
+						 "offset mask and/or max "
+						 "(current: mask=%llx, max=%llu)",
+							kring->name,
+							(unsigned long long)kring->offset_mask,
+							(unsigned long long)kring->offset_max);
+				error = EBUSY;
+				goto out;
+			}
+			mask = kring->offset_mask;
+			max_offset = kring->offset_max;
+		} else {
+			kring->offset_mask = mask;
+			*(uint64_t *)(uintptr_t)&ring->offset_mask = mask;
+			kring->offset_max = max_offset;
+			kring->offset_gap = min_gap;
+		}
+
+		/* if there is an initial offset, put it into
+		 * all the slots
+		 *
+		 * Note: we cannot change the offsets if the
+		 * ring is already in use.
+		 */
+		if (!initial_offset || kring->users > 1)
+			continue;
+
+		for (j = 0; j < kring->nkr_num_slots; j++) {
+			struct netmap_slot *slot = ring->slot + j;
+
+			nm_write_offset(kring, slot, initial_offset);
+		}
+	}
+
+out:
+	opt->nro_opt.nro_status = error;
+	if (!error) {
+		opt->nro_max_offset = max_offset;
+	}
+	return error;
+
+}
+
+static int
+netmap_compute_buf_len(struct netmap_priv_d *priv)
+{
+	enum txrx t;
+	u_int i;
+	struct netmap_kring *kring;
+	int error = 0;
+	unsigned mtu = 0;
+	struct netmap_adapter *na = priv->np_na;
+	uint64_t target, maxframe;
+
+	if (na->ifp != NULL)
+		mtu = nm_os_ifnet_mtu(na->ifp);
+
+	foreach_selected_ring(priv, t, i, kring) {
+
+		if (kring->users > 1)
+			continue;
+
+		target = NETMAP_BUF_SIZE(kring->na) -
+			kring->offset_max;
+		if (!kring->offset_gap)
+			kring->offset_gap =
+				NETMAP_BUF_SIZE(kring->na);
+		if (kring->offset_gap < target)
+			target = kring->offset_gap;
+
+		if (mtu) {
+			maxframe = mtu + ETH_HLEN +
+				ETH_FCS_LEN + VLAN_HLEN;
+			if (maxframe < target) {
+				target = maxframe;
+			}
+		}
+
+		error = kring->nm_bufcfg(kring, target);
+		if (error)
+			goto out;
+
+		*(uint64_t *)(uintptr_t)&kring->ring->buf_align = kring->buf_align;
+
+		if (mtu && t == NR_RX && kring->hwbuf_len < mtu) {
+			if (!(na->na_flags & NAF_MOREFRAG)) {
+				nm_prerr("error: large MTU (%d) needed "
+					 "but %s does not support "
+					 "NS_MOREFRAG", mtu,
+					 na->name);
+				error = EINVAL;
+				goto out;
+			} else {
+				nm_prinf("info: netmap application on "
+					 "%s needs to support "
+					 "NS_MOREFRAG "
+					 "(MTU=%u,buf_size=%llu)",
+					 kring->name, mtu,
+					 (unsigned long long)kring->hwbuf_len);
+			}
+		}
+	}
+out:
+	return error;
+}
 
 /*
  * possibly move the interface to netmap-mode.
@@ -2381,6 +2592,16 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
 	if (error)
 		goto err_rel_excl;
 
+	/* initialize offsets if requested */
+	error = netmap_offsets_init(priv, hdr);
+	if (error)
+		goto err_rel_excl;
+
+	/* compute and validate the buf lengths */
+	error = netmap_compute_buf_len(priv);
+	if (error)
+		goto err_rel_excl;
+
 	/* in all cases, create a new netmap if */
 	nifp = netmap_mem_if_new(na, priv);
 	if (nifp == NULL) {
@@ -2713,17 +2934,12 @@ netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
 		}
 #ifdef WITH_VALE
 		case NETMAP_REQ_VALE_ATTACH: {
-			error = netmap_vale_attach(hdr, NULL /* userspace request */);
+			error = netmap_bdg_attach(hdr, NULL /* userspace request */);
 			break;
 		}
 
 		case NETMAP_REQ_VALE_DETACH: {
-			error = netmap_vale_detach(hdr, NULL /* userspace request */);
-			break;
-		}
-
-		case NETMAP_REQ_VALE_LIST: {
-			error = netmap_vale_list(hdr);
+			error = netmap_bdg_detach(hdr, NULL /* userspace request */);
 			break;
 		}
 
@@ -2795,6 +3011,11 @@ netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
 			break;
 		}
 
+		case NETMAP_REQ_VALE_LIST: {
+			error = netmap_vale_list(hdr);
+			break;
+		}
+
 		case NETMAP_REQ_VALE_NEWIF: {
 			error = nm_vi_create(hdr);
 			break;
@@ -2804,13 +3025,13 @@ netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
 			error = nm_vi_destroy(hdr->nr_name);
 			break;
 		}
+#endif  /* WITH_VALE */
 
 		case NETMAP_REQ_VALE_POLLING_ENABLE:
 		case NETMAP_REQ_VALE_POLLING_DISABLE: {
 			error = nm_bdg_polling(hdr);
 			break;
 		}
-#endif  /* WITH_VALE */
 		case NETMAP_REQ_POOLS_INFO_GET: {
 			/* Get information from the memory allocator used for
 			 * hdr->nr_name. */
@@ -3029,6 +3250,9 @@ nmreq_opt_size_by_type(uint32_t nro_reqtype, uint64_t nro_size)
 	case NETMAP_REQ_OPT_SYNC_KLOOP_MODE:
 		rv = sizeof(struct nmreq_opt_sync_kloop_mode);
 		break;
+	case NETMAP_REQ_OPT_OFFSETS:
+		rv = sizeof(struct nmreq_opt_offsets);
+		break;
 	}
 	/* subtract the common header */
 	return rv - sizeof(struct nmreq_option);
@@ -3733,16 +3957,14 @@ netmap_attach_common(struct netmap_adapter *na)
 	na->active_fds = 0;
 
 	if (na->nm_mem == NULL) {
-		/* use the global allocator */
-		na->nm_mem = netmap_mem_get(&nm_mem);
+		/* use iommu or global allocator */
+		na->nm_mem = netmap_mem_get_iommu(na);
 	}
-#ifdef WITH_VALE
 	if (na->nm_bdg_attach == NULL)
 		/* no special nm_bdg_attach callback. On VALE
 		 * attach, we need to interpose a bwrap
 		 */
 		na->nm_bdg_attach = netmap_default_bdg_attach;
-#endif
 
 	return 0;
 }
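
To make the buffer-length computation in netmap_compute_buf_len()
above concrete, here is a worked instance with illustrative values
(2048-byte buffers, a bind-time max offset of 128, no minimum gap,
MTU 1500); the numbers are not taken from the commit:

	uint64_t target = 2048 - 128;		/* 1920 bytes left after the offset */
	uint64_t gap = 2048;			/* offset_gap defaults to the buffer size */
	if (gap < target)
		target = gap;			/* no change in this instance */
	uint64_t maxframe = 1500 + 14 + 4 + 4;	/* MTU + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN = 1522 */
	if (maxframe < target)
		target = maxframe;		/* 1522 is what kring->nm_bufcfg() receives */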
diff --git a/sys/dev/netmap/netmap_bdg.c b/sys/dev/netmap/netmap_bdg.c
index 4d18859e2091..57659f3a7a6e 100644
--- a/sys/dev/netmap/netmap_bdg.c
+++ b/sys/dev/netmap/netmap_bdg.c
@@ -540,6 +540,85 @@ out:
 	return error;
 }
 
+/* Process NETMAP_REQ_VALE_ATTACH.
+ */
+int
+netmap_bdg_attach(struct nmreq_header *hdr, void *auth_token)
+{
+	struct nmreq_vale_attach *req =
+		(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
+	struct netmap_vp_adapter * vpna;
+	struct netmap_adapter *na = NULL;
+	struct netmap_mem_d *nmd = NULL;
+	struct nm_bridge *b = NULL;
+	int error;
+
+	NMG_LOCK();
+	/* permission check for modified bridges */
+	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
+	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
+		error = EACCES;
+		goto unlock_exit;
+	}
+
+	if (req->reg.nr_mem_id) {
+		nmd = netmap_mem_find(req->reg.nr_mem_id);
+		if (nmd == NULL) {
+			error = EINVAL;
+			goto unlock_exit;
+		}
+	}
+
+	/* check for existing one */
+	error = netmap_get_vale_na(hdr, &na, nmd, 0);
+	if (na) {
+		error = EBUSY;
+		goto unref_exit;
+	}
+	error = netmap_get_vale_na(hdr, &na,
+				nmd, 1 /* create if not exists */);
+	if (error) { /* no device */
+		goto unlock_exit;
+	}
+
+	if (na == NULL) { /* VALE prefix missing */
+		error = EINVAL;
+		goto unlock_exit;
+	}
+
+	if (NETMAP_OWNED_BY_ANY(na)) {
+		error = EBUSY;
+		goto unref_exit;
+	}
+
+	if (na->nm_bdg_ctl) {
+		/* nop for VALE ports. The bwrap needs to put the hwna
+		 * in netmap mode (see netmap_bwrap_bdg_ctl)
+		 */
+		error = na->nm_bdg_ctl(hdr, na);
+		if (error)
+			goto unref_exit;
+		nm_prdis("registered %s to netmap-mode", na->name);
+	}
+	vpna = (struct netmap_vp_adapter *)na;
+	req->port_index = vpna->bdg_port;
+
+	if (nmd)
+		netmap_mem_put(nmd);
+
+	NMG_UNLOCK();
+	return 0;
+
+unref_exit:
+	netmap_adapter_put(na);
+unlock_exit:
+	if (nmd)
+		netmap_mem_put(nmd);
+
+	NMG_UNLOCK();
+	return error;
+}
+
 
 int
 nm_is_bwrap(struct netmap_adapter *na)
@@ -547,6 +626,74 @@ nm_is_bwrap(struct netmap_adapter *na)
 	return na->nm_register == netmap_bwrap_reg;
 }
 
+/* Process NETMAP_REQ_VALE_DETACH.
+ */
+int
+netmap_bdg_detach(struct nmreq_header *hdr, void *auth_token)
+{
+	int error;
+
+	NMG_LOCK();
+	error = netmap_bdg_detach_locked(hdr, auth_token);
+	NMG_UNLOCK();
+	return error;
+}
+
+int
+netmap_bdg_detach_locked(struct nmreq_header *hdr, void *auth_token)
+{
+	struct nmreq_vale_detach *nmreq_det = (void *)(uintptr_t)hdr->nr_body;
+	struct netmap_vp_adapter *vpna;
+	struct netmap_adapter *na;
+	struct nm_bridge *b = NULL;
+	int error;
+
+	/* permission check for modified bridges */
+	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
+	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
+		error = EACCES;
+		goto error_exit;
+	}
+
+	error = netmap_get_vale_na(hdr, &na, NULL, 0 /* don't create */);
+	if (error) { /* no device, or another bridge or user owns the device */
+		goto error_exit;
+	}
+
+	if (na == NULL) { /* VALE prefix missing */
+		error = EINVAL;
+		goto error_exit;
+	} else if (nm_is_bwrap(na) &&
+		   ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
+		/* Don't detach a NIC with polling */
+		error = EBUSY;
+		goto unref_exit;
+	}
+
+	vpna = (struct netmap_vp_adapter *)na;
+	if (na->na_vp != vpna) {
+		/* trying to detach the first attach of a VALE persistent
+		 * port that is attached to 2 bridges
+		 */
+		error = EBUSY;
+		goto unref_exit;
+	}
+	nmreq_det->port_index = vpna->bdg_port;
+
+	if (na->nm_bdg_ctl) {
+		/* remove the port from bridge. The bwrap
+		 * also needs to put the hwna in normal mode
+		 */
+		error = na->nm_bdg_ctl(hdr, na);
+	}
+
+unref_exit:
+	netmap_adapter_put(na);
+error_exit:
+	return error;
+
+}
+
 
 struct nm_bdg_polling_state;
 struct
@@ -1092,7 +1239,7 @@ netmap_bwrap_dtor(struct netmap_adapter *na)
  * hwna rx ring.
  * The bridge wrapper then sends the packets through the bridge.
  */
-static int
+int
 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
 {
 	struct netmap_adapter *na = kring->na;
@@ -1217,7 +1364,7 @@ netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
 		/* intercept the hwna nm_nofify callback on the hw rings */
 		for (i = 0; i < hwna->num_rx_rings; i++) {
 			hwna->rx_rings[i]->save_notify = hwna->rx_rings[i]->nm_notify;
-			hwna->rx_rings[i]->nm_notify = netmap_bwrap_intr_notify;
+			hwna->rx_rings[i]->nm_notify = bna->nm_intr_notify;
 		}
 		i = hwna->num_rx_rings; /* for safety */
 		/* save the host ring notify unconditionally */
@@ -1250,12 +1397,6 @@ netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
 		hwna->na_lut.objtotal = 0;
 		hwna->na_lut.objsize = 0;
 
-		/* pass ownership of the netmap rings to the hwna */
-		for_rx_tx(t) {
-			for (i = 0; i < netmap_all_rings(na, t); i++) {
-				NMR(na, t)[i]->ring = NULL;
-			}
-		}
 		/* reset the number of host rings to default */
 		for_rx_tx(t) {
 			nma_set_host_nrings(hwna, t, 1);
@@ -1275,6 +1416,11 @@ netmap_bwrap_config(struct netmap_adapter *na, struct nm_config_info *info)
 	struct netmap_adapter *hwna = bna->hwna;
 	int error;
 
+	/* cache the lut in the embedded host adapter */
+	error = netmap_mem_get_lut(hwna->nm_mem, &bna->host.up.na_lut);
+	if (error)
+		return error;
+
 	/* Forward the request to the hwna. It may happen that nobody
 	 * registered hwna yet, so netmap_mem_get_lut() may have not
 	 * been called yet. */
@@ -1289,9 +1435,69 @@ netmap_bwrap_config(struct netmap_adapter *na, struct nm_config_info *info)
 	info->num_rx_descs = hwna->num_tx_desc;
 	info->rx_buf_maxsize = hwna->rx_buf_maxsize;
 
+	if (na->na_flags & NAF_HOST_RINGS) {
+		struct netmap_adapter *hostna = &bna->host.up;
+		enum txrx t;
+
+		/* limit the number of host rings to that of hw */
+		if (na->na_flags & NAF_HOST_ALL) {
+			hostna->num_tx_rings = nma_get_nrings(hwna, NR_RX);
+			hostna->num_rx_rings = nma_get_nrings(hwna, NR_TX);
+		} else {
+			nm_bound_var(&hostna->num_tx_rings, 1, 1,
+				nma_get_nrings(hwna, NR_TX), NULL);
+			nm_bound_var(&hostna->num_rx_rings, 1, 1,
+				nma_get_nrings(hwna, NR_RX), NULL);
+		}
+		for_rx_tx(t) {
+			enum txrx r = nm_txrx_swap(t);
+			u_int nr = nma_get_nrings(hostna, t);
+
+			nma_set_host_nrings(na, t, nr);
+			if (nma_get_host_nrings(hwna, t) < nr) {
+				nma_set_host_nrings(hwna, t, nr);
+			}
+			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
+		}
+	}
+
 	return 0;
 }
 
+/* nm_bufcfg callback for bwrap */
+static int
+netmap_bwrap_bufcfg(struct netmap_kring *kring, uint64_t target)
+{
+	struct netmap_adapter *na = kring->na;
+	struct netmap_bwrap_adapter *bna =
+		(struct netmap_bwrap_adapter *)na;
+	struct netmap_adapter *hwna = bna->hwna;
+	struct netmap_kring *hwkring;
+	enum txrx r;
+	int error;
+
+	/* we need the hw kring that corresponds to the bwrap one:
+	 * remember that rx and tx are swapped
+	 */
+	r = nm_txrx_swap(kring->tx);
+	hwkring = NMR(hwna, r)[kring->ring_id];
+
+	/* copy down the offset information, forward the request
+	 * and copy up the results
+	 */
+	hwkring->offset_mask = kring->offset_mask;
+	hwkring->offset_max  = kring->offset_max;
+	hwkring->offset_gap  = kring->offset_gap;
+
+	error = hwkring->nm_bufcfg(hwkring, target);
+	if (error)
+		return error;
+
+	kring->hwbuf_len = hwkring->hwbuf_len;
+	kring->buf_align = hwkring->buf_align;
+
+	return 0;
+}
 
 /* nm_krings_create callback for bwrap */
 int
@@ -1314,6 +1520,9 @@ netmap_bwrap_krings_create_common(struct netmap_adapter *na)
 	for_rx_tx(t) {
 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
 			NMR(hwna, t)[i]->users++;
+			/* this prevents deletion of the rings through
+			 * our krings instead of through the hwna ones */
+			NMR(na, t)[i]->nr_kflags |= NKR_NEEDRING;
 		}
 	}
 
@@ -1355,6 +1564,7 @@ err_dec_users:
 	for_rx_tx(t) {
 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
 			NMR(hwna, t)[i]->users--;
+			NMR(na, t)[i]->users--;
 		}
 	}
 	hwna->nm_krings_delete(hwna);
@@ -1377,6 +1587,7 @@ netmap_bwrap_krings_delete_common(struct netmap_adapter *na)
 	for_rx_tx(t) {
 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
 			NMR(hwna, t)[i]->users--;
+			NMR(na, t)[i]->users--;
 		}
 	}
 
@@ -1480,6 +1691,7 @@ netmap_bwrap_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
 		error = netmap_do_regif(npriv, na, hdr);
 		if (error) {
 			netmap_priv_delete(npriv);
+			netmap_mem_restore(bna->hwna);
 			return error;
 		}
 		bna->na_kpriv = npriv;
@@ -1490,6 +1702,7 @@ netmap_bwrap_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
 		netmap_priv_delete(bna->na_kpriv);
 		bna->na_kpriv = NULL;
 		na->na_flags &= ~NAF_BUSY;
+		netmap_mem_restore(bna->hwna);
 	}
 
 	return error;
@@ -1527,6 +1740,7 @@ netmap_bwrap_attach_common(struct netmap_adapter *na,
 	}
 	na->nm_dtor = netmap_bwrap_dtor;
 	na->nm_config = netmap_bwrap_config;
+	na->nm_bufcfg = netmap_bwrap_bufcfg;
 	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
 	na->pdev = hwna->pdev;
 	na->nm_mem = netmap_mem_get(hwna->nm_mem);
@@ -1546,25 +1760,8 @@ netmap_bwrap_attach_common(struct netmap_adapter *na,
 		na->na_flags |= NAF_HOST_RINGS;
 		hostna = &bna->host.up;
 
-		/* limit the number of host rings to that of hw */
-		nm_bound_var(&hostna->num_tx_rings, 1, 1,
-				nma_get_nrings(hwna, NR_TX), NULL);
-		nm_bound_var(&hostna->num_rx_rings, 1, 1,
-				nma_get_nrings(hwna, NR_RX), NULL);
-
 		snprintf(hostna->name, sizeof(hostna->name), "%s^", na->name);
 		hostna->ifp = hwna->ifp;
-		for_rx_tx(t) {
-			enum txrx r = nm_txrx_swap(t);
-			u_int nr = nma_get_nrings(hostna, t);
-
-			nma_set_nrings(hostna, t, nr);
-			nma_set_host_nrings(na, t, nr);
-			if (nma_get_host_nrings(hwna, t) < nr) {
-				nma_set_host_nrings(hwna, t, nr);
-			}
-			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
-		}
 		// hostna->nm_txsync = netmap_bwrap_host_txsync;
 		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
 		hostna->nm_mem = netmap_mem_get(na->nm_mem);
@@ -1574,6 +1771,7 @@ netmap_bwrap_attach_common(struct netmap_adapter *na,
 			hostna->na_hostvp = &bna->host;
 		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
 		hostna->rx_buf_maxsize = hwna->rx_buf_maxsize;
+		/* bwrap_config() will determine the number of host rings */
 	}
 	if (hwna->na_flags & NAF_MOREFRAG)
 		na->na_flags |= NAF_MOREFRAG;
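
netmap_default_bufcfg() and netmap_bwrap_bufcfg() above are the two
in-tree shapes of the new nm_bufcfg callback. A NIC driver with real
hardware constraints would supply its own; the following sketch is
hypothetical, for a device that accepts RX buffer lengths in 1 KiB
units and needs 128-byte aligned addresses (constraints invented for
illustration):

	static int
	foo_netmap_bufcfg(struct netmap_kring *kring, uint64_t target)
	{
		uint64_t len = target & ~1023ULL; /* round down to the hw granularity */

		if (len == 0)
			return EINVAL;	/* the offsets left no usable room */
		kring->hwbuf_len = len;
		kring->buf_align = 128;	/* user offsets must preserve this */
		return 0;
	}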
diff --git a/sys/dev/netmap/netmap_bdg.h b/sys/dev/netmap/netmap_bdg.h
index e4683885e66c..a88eaf11b07c 100644
--- a/sys/dev/netmap/netmap_bdg.h
+++ b/sys/dev/netmap/netmap_bdg.h
@@ -178,8 +178,10 @@ int netmap_bdg_free(struct nm_bridge *b);
 void netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw);
 int netmap_vp_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na);
 int netmap_bwrap_reg(struct netmap_adapter *, int onoff);
+int netmap_bdg_detach_locked(struct nmreq_header *hdr, void *auth_token);
 int netmap_vp_reg(struct netmap_adapter *na, int onoff);
 int netmap_vp_rxsync(struct netmap_kring *kring, int flags);
+int netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags);
 int netmap_bwrap_notify(struct netmap_kring *kring, int flags);
 int netmap_bwrap_attach_common(struct netmap_adapter *na,
 		struct netmap_adapter *hwna);
diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c
index 2cedea4440fe..a47cb508de04 100644
--- a/sys/dev/netmap/netmap_freebsd.c
+++ b/sys/dev/netmap/netmap_freebsd.c
@@ -1057,7 +1057,7 @@ netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset,
 		vm_page_replace(page, object, (*mres)->pindex, *mres);
 		*mres = page;
 	}
-	page->valid = VM_PAGE_BITS_ALL;
+	vm_page_valid(page);
 	return (VM_PAGER_OK);
 }
 
diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c
index 09ba550cae92..f999576736fb 100644
--- a/sys/dev/netmap/netmap_generic.c
+++ b/sys/dev/netmap/netmap_generic.c
@@ -106,7 +106,7 @@ __FBSDID("$FreeBSD$");
 static inline struct mbuf *
 nm_os_get_mbuf(struct ifnet *ifp, int len)
 {
-	return alloc_skb(ifp->needed_headroom + len +
+	return alloc_skb(LL_RESERVED_SPACE(ifp) + len +
 			 ifp->needed_tailroom, GFP_ATOMIC);
 }
 
diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h
index fd9db5842df3..d9ae6a4f2054 100644
--- a/sys/dev/netmap/netmap_kern.h
+++ b/sys/dev/netmap/netmap_kern.h
@@ -459,8 +459,16 @@ struct netmap_kring {
 	 * On a NIC reset, the NIC ring indexes may be reset but the
 	 * indexes in the netmap rings remain the same. nkr_hwofs
 	 * keeps track of the offset between the two.
+	 *
+	 * Moreover, during reset, we can restore only the subset of
+	 * the NIC ring that corresponds to the kernel-owned part of
+	 * the netmap ring. The rest of the slots must be restored
+	 * by the *sync routines when the user releases more slots.
+	 * The nkr_to_refill field keeps track of the number of slots
+	 * that still need to be restored.
 	 */
 	int32_t		nkr_hwofs;
+	int32_t		nkr_to_refill;
 
 	/* last_reclaim is opaque marker to help reduce the frequency
 	 * of operations such as reclaiming tx buffers. A possible use
@@ -535,6 +543,36 @@ struct netmap_kring {
 	uint32_t pipe_tail;		/* hwtail updated by the other end */
 #endif /* WITH_PIPES */
 
+	/* mask for the offset-related part of the ptr field in the slots */
+	uint64_t offset_mask;
+	/* maximum user-specified offset, as stipulated at bind time.
+	 * Larger offset requests will be silently capped to offset_max.
+	 */
+	uint64_t offset_max;
+	/* minimum gap between two consecutive offsets into the same
+	 * buffer, as stipulated at bind time. This is used to choose
+	 * the hwbuf_len, but is not otherwise checked for compliance
+	 * at runtime.
+	 */
+	uint64_t offset_gap;
+
+	/* size of hardware buffer. This may be less than the size of
+	 * the netmap buffers because of non-zero offsets, or because
+	 * the netmap buffer size exceeds the capability of the hardware.
+	 */
+	uint64_t hwbuf_len;
+
+	/* required alignment (in bytes) for the buffers used by this ring.
+	 * Netmap buffers are aligned to cachelines, which should suffice
+	 * for most NICs. If the user is passing offsets, though, we need
+	 * to check that the resulting buf address complies with any
+	 * alignment restriction.
+	 */
+	uint64_t buf_align;
+
+	/* hardware-specific logic for the selection of the hwbuf_len */
+	int (*nm_bufcfg)(struct netmap_kring *kring, uint64_t target);
+
 	int (*save_notify)(struct netmap_kring *kring, int flags);
 
 #ifdef WITH_MONITOR
@@ -719,6 +757,8 @@ struct netmap_adapter {
 #define NAF_FORCE_NATIVE 128	/* the adapter is always NATIVE */
 /* free */
 #define NAF_MOREFRAG	512	/* the adapter supports NS_MOREFRAG */
+#define NAF_OFFSETS	1024	/* the adapter supports the slot offsets */
+#define NAF_HOST_ALL	2048	/* the adapter wants as many host rings as hw */
 #define NAF_ZOMBIE	(1U<<30) /* the nic driver has been unloaded */
 #define	NAF_BUSY	(1U<<31) /* the adapter is used internally and
 				  * cannot be registered from userspace
@@ -782,6 +822,22 @@ struct netmap_adapter {
 	 * nm_config() returns configuration information from the OS
 	 *	Called with NMG_LOCK held.
 	 *
+	 * nm_bufcfg()
+	 *      the purpose of this callback is to fill the kring->hwbuf_len
+	 *      (l) and kring->buf_align fields. The l value is most important
+	 *      for RX rings, where we want to disallow writes outside of the
+	 *      netmap buffer. The l value must be computed taking into account
+	 *      the stipulated max_offset (o), possibily increased if there are
+	 *      alignment constraints, the maxframe (m), if known, and the
+	 *      current NETMAP_BUF_SIZE (b) of the memory region used by the
+	 *      adapter. We want the largest supported l such that o + l <= b.
+	 *      If m is known to be <= b - o, the callback may also choose the
+	 *      largest l <= b, ignoring the offset.  The buf_align field is
+	 *      most important for TX rings when there are offsets.  The user
+	 *      will see this value in the ring->buf_align field.  Misaligned
+	 *      offsets will cause the corresponding packets to be silently
+	 *      dropped.
+	 *
 	 * nm_krings_create() create and init the tx_rings and
 	 * 	rx_rings arrays of kring structures. In particular,
 	 * 	set the nm_sync callbacks for each ring.
@@ -811,6 +867,7 @@ struct netmap_adapter {
 	int (*nm_txsync)(struct netmap_kring *kring, int flags);
 	int (*nm_rxsync)(struct netmap_kring *kring, int flags);
 	int (*nm_notify)(struct netmap_kring *kring, int flags);
+	int (*nm_bufcfg)(struct netmap_kring *kring, uint64_t target);
 #define NAF_FORCE_READ      1
 #define NAF_FORCE_RECLAIM   2
 #define NAF_CAN_FORWARD_DOWN 4
@@ -1096,12 +1153,13 @@ struct netmap_bwrap_adapter {
 	 * here its original value, to be restored at detach
 	 */
 	struct netmap_vp_adapter *saved_na_vp;
+	int (*nm_intr_notify)(struct netmap_kring *kring, int flags);
 };
 int nm_bdg_polling(struct nmreq_header *hdr);
 
+int netmap_bdg_attach(struct nmreq_header *hdr, void *auth_token);
+int netmap_bdg_detach(struct nmreq_header *hdr, void *auth_token);
 #ifdef WITH_VALE
-int netmap_vale_attach(struct nmreq_header *hdr, void *auth_token);
-int netmap_vale_detach(struct nmreq_header *hdr, void *auth_token);
 int netmap_vale_list(struct nmreq_header *hdr);
 int netmap_vi_create(struct nmreq_header *hdr, int);
 int nm_vi_create(struct nmreq_header *);
@@ -1431,6 +1489,12 @@ uint32_t nm_rxsync_prologue(struct netmap_kring *, struct netmap_ring *);
 	} while (0)
 #endif
 
+#define NM_CHECK_ADDR_LEN_OFF(na_, l_, o_) do {				\
+	if ((l_) + (o_) < (l_) || 					\
+	    (l_) + (o_) > NETMAP_BUF_SIZE(na_)) {			\
+		(l_) = NETMAP_BUF_SIZE(na_) - (o_);			\
+	} } while (0)
+
 
 /*---------------------------------------------------------------*/
 /*
@@ -1493,6 +1557,7 @@ int netmap_get_na(struct nmreq_header *hdr, struct netmap_adapter **na,
 void netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp);
 int netmap_get_hw_na(struct ifnet *ifp,
 		struct netmap_mem_d *nmd, struct netmap_adapter **na);
+void netmap_mem_restore(struct netmap_adapter *na);
 
 #ifdef WITH_VALE
 uint32_t netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
@@ -1680,7 +1745,7 @@ extern int netmap_generic_txqdisc;
 
 /* Assigns the device IOMMU domain to an allocator.
  * Returns -ENOMEM in case the domain is different */
-#define nm_iommu_group_id(dev) (0)
+#define nm_iommu_group_id(dev) (-1)
 
 /* Callback invoked by the dma machinery after a successful dmamap_load */
 static void netmap_dmamap_cb(__unused void *arg,
@@ -1890,6 +1955,9 @@ struct plut_entry {
 
 struct netmap_obj_pool;
 
+/* alignment for netmap buffers */
+#define NM_BUF_ALIGN	64
+
 /*
  * NMB return the virtual address of a buffer (buffer 0 on bad index)
  * PNMB also fills the physical address
@@ -1919,6 +1987,40 @@ PNMB(struct netmap_adapter *na, struct netmap_slot *slot, uint64_t *pp)
 	return ret;
 }
 
+static inline void
+nm_write_offset(struct netmap_kring *kring,
+		struct netmap_slot *slot, uint64_t offset)
+{
+	slot->ptr = (slot->ptr & ~kring->offset_mask) |
+		(offset & kring->offset_mask);
+}
+
+static inline uint64_t
+nm_get_offset(struct netmap_kring *kring, struct netmap_slot *slot)
+{
+	uint64_t offset = (slot->ptr & kring->offset_mask);
+	if (unlikely(offset > kring->offset_max))
+		offset = kring->offset_max;
+	return offset;
+}
+
+static inline void *
+NMB_O(struct netmap_kring *kring, struct netmap_slot *slot)
*** 1433 LINES SKIPPED ***
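
As a closing illustration of the datapath side, a kernel consumer
draining an RX ring would honor per-slot offsets through the
nm_get_offset() helper above; a minimal sketch, where process() is a
hypothetical per-packet handler:

	struct netmap_ring *ring = kring->ring;
	u_int head = ring->head;

	while (head != ring->tail) {
		struct netmap_slot *slot = &ring->slot[head];
		uint64_t off = nm_get_offset(kring, slot); /* capped at offset_max */
		char *payload = (char *)NMB(kring->na, slot) + off;

		process(payload, slot->len);
		head = nm_next(head, ring->num_slots - 1);
	}
	ring->head = ring->cur = head;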

