svn commit: r293719 - head/sys/dev/hyperv/netvsc

Sepherosa Ziehau sephe at FreeBSD.org
Tue Jan 12 01:30:53 UTC 2016


Author: sephe
Date: Tue Jan 12 01:30:51 2016
New Revision: 293719
URL: https://svnweb.freebsd.org/changeset/base/293719

Log:
  hyperv/hn: Implement LRO
  
  - Implement the LRO using tcp_lro APIs, and LRO is enabled by default.
  - Add several stats sysctl nodes.
  - Check IP/TCP length before sending the packet to tcp_lro_rx(), if host
    does not provide RX csum information (*); and add an option through
    sysctl to always trust host TCP segment csum checks (default is off).
  - Add sysctl to control the LRO entry depth; it is disabled by default.
    It is used to avoid holding too many TCP segments in the driver.  Limiting
    the LRO entry depth helps a lot in a one/two streams RX test.
  
  This triples the RX performance in my local test (3Gbps -> 10Gbps), and
  ~2x the RX performance over a directly connected 40Ge network (5Gbps ->
  9Gbps).
  
  (*) It seems the host stops supplying csum information, once the network
  load is high.  This still needs investigation...
  
  Reviewed by:		Hongjiang Zhang <honzhan microsoft com>,
  			Dexuan Cui <decui microsoft com>,
  			Jun Su <junsu microsoft com>,
  			delphij
  Tested by:		me (local),
  			Hongjiang Zhang <honzhan microsoft com>
  			(directly connected 40Ge)
  Approved by:		delphij (mentor), adrian (mentor, no objection)
  With feedback from:	delphij, Hongjiang Zhang <honzhan microsoft com>
  Sponsored by:		Microsoft OSTC
  Differential Revision:	https://reviews.freebsd.org/D4824

Modified:
  head/sys/dev/hyperv/netvsc/hv_net_vsc.c
  head/sys/dev/hyperv/netvsc/hv_net_vsc.h
  head/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
  head/sys/dev/hyperv/netvsc/hv_rndis.h
  head/sys/dev/hyperv/netvsc/hv_rndis_filter.c
  head/sys/dev/hyperv/netvsc/hv_rndis_filter.h

Modified: head/sys/dev/hyperv/netvsc/hv_net_vsc.c
==============================================================================
--- head/sys/dev/hyperv/netvsc/hv_net_vsc.c	Tue Jan 12 01:23:45 2016	(r293718)
+++ head/sys/dev/hyperv/netvsc/hv_net_vsc.c	Tue Jan 12 01:30:51 2016	(r293719)
@@ -919,6 +919,7 @@ hv_nv_on_receive(netvsc_dev *net_dev, st
 	 */
 	hv_nv_on_receive_completion(device, vm_xfer_page_pkt->d.transaction_id,
 	    status);
+	hv_rf_receive_rollup(net_dev);
 }
 
 /*

Modified: head/sys/dev/hyperv/netvsc/hv_net_vsc.h
==============================================================================
--- head/sys/dev/hyperv/netvsc/hv_net_vsc.h	Tue Jan 12 01:23:45 2016	(r293718)
+++ head/sys/dev/hyperv/netvsc/hv_net_vsc.h	Tue Jan 12 01:30:51 2016	(r293719)
@@ -43,6 +43,8 @@
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/sx.h>
+#include <netinet/in.h>
+#include <netinet/tcp_lro.h>
 
 #include <dev/hyperv/include/hyperv.h>
 
@@ -993,6 +995,17 @@ typedef struct hn_softc {
 	int             temp_unusable;
 	struct hv_device  *hn_dev_obj;
 	netvsc_dev  	*net_dev;
+
+	struct lro_ctrl	hn_lro;
+	int		hn_lro_hiwat;
+
+	/* Trust tcp segments verification on host side */
+	int		hn_trust_hosttcp;
+
+	u_long		hn_csum_ip;
+	u_long		hn_csum_tcp;
+	u_long		hn_csum_trusted;
+	u_long		hn_lro_tried;
 } hn_softc_t;
 
 

Modified: head/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
==============================================================================
--- head/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c	Tue Jan 12 01:23:45 2016	(r293718)
+++ head/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c	Tue Jan 12 01:30:51 2016	(r293719)
@@ -69,6 +69,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
+#include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_arp.h>
@@ -138,6 +139,15 @@ __FBSDID("$FreeBSD$");
     CSUM_IP_ISCSI|CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP|		\
     CSUM_IP6_TSO|CSUM_IP6_ISCSI)
 
+/* XXX move to netinet/tcp_lro.h */
+#define HN_LRO_HIWAT_MAX				65535
+#define HN_LRO_HIWAT_DEF				HN_LRO_HIWAT_MAX
+/* Valid range is [2*MTU, MAX]; 2*MTU is a bit rough, but good enough. */
+#define HN_LRO_HIWAT_MTULIM(ifp)			(2 * (ifp)->if_mtu)
+#define HN_LRO_HIWAT_ISVALID(sc, hiwat)			\
+    ((hiwat) >= HN_LRO_HIWAT_MTULIM((sc)->hn_ifp) &&	\
+     (hiwat) <= HN_LRO_HIWAT_MAX)
+
 /*
  * Data types
  */
@@ -171,6 +181,9 @@ int hv_promisc_mode = 0;    /* normal mo
 /* The one and only one */
 static struct hv_netvsc_driver_context g_netvsc_drv;
 
+/* Trust TCP segment verification done on the host side. */
+static int hn_trust_hosttcp = 0;
+TUNABLE_INT("dev.hn.trust_hosttcp", &hn_trust_hosttcp);
 
 /*
  * Forward declarations
@@ -181,6 +194,19 @@ static void hn_ifinit(void *xsc);
 static int  hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
 static int  hn_start_locked(struct ifnet *ifp);
 static void hn_start(struct ifnet *ifp);
+#ifdef HN_LRO_HIWAT
+static int hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS);
+#endif
+static int hn_check_iplen(const struct mbuf *, int);
+
+static __inline void	/* Record a new LRO high watermark in the softc. */
+hn_set_lro_hiwat(struct hn_softc *sc, int hiwat)
+{
+	sc->hn_lro_hiwat = hiwat;	/* softc copy, always kept */
+#ifdef HN_LRO_HIWAT
+	sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat;	/* mirror into lro_ctrl */
+#endif
+}
 
 /*
  * NetVsc get message transport protocol type 
@@ -310,6 +336,8 @@ netvsc_attach(device_t dev)
 	hn_softc_t *sc;
 	int unit = device_get_unit(dev);
 	struct ifnet *ifp;
+	struct sysctl_oid_list *child;
+	struct sysctl_ctx_list *ctx;
 	int ret;
 
 	netvsc_init();
@@ -322,6 +350,8 @@ netvsc_attach(device_t dev)
 	bzero(sc, sizeof(hn_softc_t));
 	sc->hn_unit = unit;
 	sc->hn_dev = dev;
+	sc->hn_lro_hiwat = HN_LRO_HIWAT_DEF;
+	sc->hn_trust_hosttcp = hn_trust_hosttcp;
 
 	NV_LOCK_INIT(sc, "NetVSCLock");
 
@@ -349,9 +379,11 @@ netvsc_attach(device_t dev)
 	 */
 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
 	ifp->if_capabilities |=
-	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO;
+	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO |
+	    IFCAP_LRO;
 	ifp->if_capenable |=
-	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO;
+	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO |
+	    IFCAP_LRO;
 	/*
 	 * Only enable UDP checksum offloading when it is on 2012R2 or
 	 * later. UDP checksum offloading doesn't work on earlier
@@ -372,8 +404,59 @@ netvsc_attach(device_t dev)
 		sc->hn_carrier = 1;
 	}
 
+	tcp_lro_init(&sc->hn_lro);
+	/* Driver private LRO settings */
+	sc->hn_lro.ifp = ifp;
+#ifdef HN_LRO_HIWAT
+	sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat;
+#endif
+
 	ether_ifattach(ifp, device_info.mac_addr);
 
+	ctx = device_get_sysctl_ctx(dev);
+	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_queued",
+	    CTLFLAG_RW, &sc->hn_lro.lro_queued, 0, "LRO queued");
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_flushed",
+	    CTLFLAG_RW, &sc->hn_lro.lro_flushed, 0, "LRO flushed");
+	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "lro_tried",
+	    CTLFLAG_RW, &sc->hn_lro_tried, "# of LRO tries");
+#ifdef HN_LRO_HIWAT
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_hiwat",
+	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_hiwat_sysctl,
+	    "I", "LRO high watermark");
+#endif
+	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "trust_hosttcp",
+	    CTLFLAG_RW, &sc->hn_trust_hosttcp, 0,
+	    "Trust tcp segement verification on host side, "
+	    "when csum info is missing");
+	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_ip",
+	    CTLFLAG_RW, &sc->hn_csum_ip, "RXCSUM IP");
+	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_tcp",
+	    CTLFLAG_RW, &sc->hn_csum_tcp, "RXCSUM TCP");
+	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_trusted",
+	    CTLFLAG_RW, &sc->hn_csum_trusted,
+	    "# of TCP segements that we trust host's csum verification");
+
+	if (unit == 0) {
+		struct sysctl_ctx_list *dc_ctx;
+		struct sysctl_oid_list *dc_child;
+		devclass_t dc;
+
+		/*
+		 * Add sysctl nodes for devclass
+		 */
+		dc = device_get_devclass(dev);
+		dc_ctx = devclass_get_sysctl_ctx(dc);
+		dc_child = SYSCTL_CHILDREN(devclass_get_sysctl_tree(dc));
+
+		SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "trust_hosttcp",
+		    CTLFLAG_RD, &hn_trust_hosttcp, 0,
+		    "Trust tcp segement verification on host side, "
+		    "when csum info is missing (global setting)");
+	}
+
 	return (0);
 }
 
@@ -383,6 +466,7 @@ netvsc_attach(device_t dev)
 static int
 netvsc_detach(device_t dev)
 {
+	struct hn_softc *sc = device_get_softc(dev);
 	struct hv_device *hv_device = vmbus_get_devctx(dev); 
 
 	if (bootverbose)
@@ -401,6 +485,8 @@ netvsc_detach(device_t dev)
 
 	hv_rf_on_device_remove(hv_device, HV_RF_NV_DESTROY_CHANNEL);
 
+	tcp_lro_free(&sc->hn_lro);
+
 	return (0);
 }
 
@@ -887,7 +973,7 @@ netvsc_recv(struct hv_device *device_ctx
 	struct mbuf *m_new;
 	struct ifnet *ifp;
 	device_t dev = device_ctx->device;
-	int size;
+	int size, do_lro = 0;
 
 	if (sc == NULL) {
 		return (0); /* TODO: KYS how can this be! */
@@ -938,6 +1024,7 @@ netvsc_recv(struct hv_device *device_ctx
 		if (csum_info->receive.ip_csum_succeeded) {
 			m_new->m_pkthdr.csum_flags |=
 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
+			sc->hn_csum_ip++;
 		}
 
 		/* TCP csum offload */
@@ -945,9 +1032,50 @@ netvsc_recv(struct hv_device *device_ctx
 			m_new->m_pkthdr.csum_flags |=
 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 			m_new->m_pkthdr.csum_data = 0xffff;
+			sc->hn_csum_tcp++;
 		}
-	}
 
+		if (csum_info->receive.ip_csum_succeeded &&
+		    csum_info->receive.tcp_csum_succeeded)
+			do_lro = 1;
+	} else {
+		const struct ether_header *eh;
+		uint16_t etype;
+		int hoff;
+
+		hoff = sizeof(*eh);
+		if (m_new->m_len < hoff)
+			goto skip;
+		eh = mtod(m_new, struct ether_header *);
+		etype = ntohs(eh->ether_type);
+		if (etype == ETHERTYPE_VLAN) {
+			const struct ether_vlan_header *evl;
+
+			hoff = sizeof(*evl);
+			if (m_new->m_len < hoff)
+				goto skip;
+			evl = mtod(m_new, struct ether_vlan_header *);
+			etype = ntohs(evl->evl_proto);
+		}
+
+		if (etype == ETHERTYPE_IP) {
+			int pr;
+
+			pr = hn_check_iplen(m_new, hoff);
+			if (pr == IPPROTO_TCP) {
+				if (sc->hn_trust_hosttcp) {
+					sc->hn_csum_trusted++;
+					m_new->m_pkthdr.csum_flags |=
+					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
+					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+					m_new->m_pkthdr.csum_data = 0xffff;
+				}
+				/* Rely on SW csum verification though... */
+				do_lro = 1;
+			}
+		}
+	}
+skip:
 	if ((packet->vlan_tci != 0) &&
 	    (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0) {
 		m_new->m_pkthdr.ether_vtag = packet->vlan_tci;
@@ -961,12 +1089,37 @@ netvsc_recv(struct hv_device *device_ctx
 
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 
+	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
+		struct lro_ctrl *lro = &sc->hn_lro;
+
+		if (lro->lro_cnt) {
+			sc->hn_lro_tried++;
+			if (tcp_lro_rx(lro, m_new, 0) == 0) {
+				/* DONE! */
+				return 0;
+			}
+		}
+	}
+
 	/* We're not holding the lock here, so don't release it */
 	(*ifp->if_input)(ifp, m_new);
 
 	return (0);
 }
 
+void	/* Drain the LRO active list, flushing every queued entry. */
+netvsc_recv_rollup(struct hv_device *device_ctx)
+{
+	hn_softc_t *sc = device_get_softc(device_ctx->device);
+	struct lro_ctrl *lro = &sc->hn_lro;
+	struct lro_entry *queued;
+
+	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
+		SLIST_REMOVE_HEAD(&lro->lro_active, next);	/* unlink head */
+		tcp_lro_flush(lro, queued);	/* then flush that entry */
+	}
+}
+
 /*
  * Rules for using sc->temp_unusable:
  * 1.  sc->temp_unusable can only be read or written while holding NV_LOCK()
@@ -1022,7 +1175,13 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, 
 
 		/* Obtain and record requested MTU */
 		ifp->if_mtu = ifr->ifr_mtu;
- 		
+		/*
+		 * Make sure that LRO high watermark is still valid,
+		 * after MTU change (the 2*MTU limit).
+		 */
+		if (!HN_LRO_HIWAT_ISVALID(sc, sc->hn_lro_hiwat))
+			hn_set_lro_hiwat(sc, HN_LRO_HIWAT_MTULIM(ifp));
+
 		do {
 			NV_LOCK(sc);
 			if (!sc->temp_unusable) {
@@ -1147,6 +1306,8 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, 
 				ifp->if_capenable |= IFCAP_RXCSUM;
 			}
 		}
+		if (mask & IFCAP_LRO)
+			ifp->if_capenable ^= IFCAP_LRO;
 
 		if (mask & IFCAP_TSO4) {
 			ifp->if_capenable ^= IFCAP_TSO4;
@@ -1292,6 +1453,102 @@ hn_watchdog(struct ifnet *ifp)
 }
 #endif
 
+#ifdef HN_LRO_HIWAT
+static int	/* sysctl handler: read or update the LRO high watermark */
+hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct hn_softc *sc = arg1;
+	int hiwat, error;
+
+	hiwat = sc->hn_lro_hiwat;
+	error = sysctl_handle_int(oidp, &hiwat, 0, req);
+	if (error || req->newptr == NULL)	/* failed, or no new value */
+		return error;
+
+	if (!HN_LRO_HIWAT_ISVALID(sc, hiwat))	/* validity macro said no */
+		return EINVAL;
+
+	if (sc->hn_lro_hiwat != hiwat)
+		hn_set_lro_hiwat(sc, hiwat);
+	return 0;
+}
+#endif	/* HN_LRO_HIWAT */
+
+static int		/* returns ip_p, or IPPROTO_DONE if any check fails */
+hn_check_iplen(const struct mbuf *m, int hoff)
+{
+	const struct ip *ip;
+	int len, iphlen, iplen;
+	const struct tcphdr *th;
+	int thoff;				/* TCP data offset */
+
+	len = hoff + sizeof(struct ip);
+
+	/* The packet must be at least the size of an IP header. */
+	if (m->m_pkthdr.len < len)
+		return IPPROTO_DONE;
+
+	/* The fixed IP header must reside completely in the first mbuf. */
+	if (m->m_len < len)
+		return IPPROTO_DONE;
+
+	ip = mtodo(m, hoff);
+
+	/* Bound check the packet's stated IP header length. */
+	iphlen = ip->ip_hl << 2;
+	if (iphlen < sizeof(struct ip))		/* minimum header length */
+		return IPPROTO_DONE;
+
+	/* The full IP header must reside completely in the first mbuf. */
+	if (m->m_len < hoff + iphlen)
+		return IPPROTO_DONE;
+
+	iplen = ntohs(ip->ip_len);
+
+	/*
+	 * Check that the amount of data in the buffers is at
+	 * least as much as the IP header would have us expect.
+	 */
+	if (m->m_pkthdr.len < hoff + iplen)
+		return IPPROTO_DONE;
+
+	/*
+	 * Ignore IP fragments.
+	 */
+	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
+		return IPPROTO_DONE;
+
+	/*
+	 * The TCP/IP or UDP/IP header must be entirely contained within
+	 * the first mbuf of the packet.
+	 */
+	switch (ip->ip_p) {
+	case IPPROTO_TCP:
+		if (iplen < iphlen + sizeof(struct tcphdr))
+			return IPPROTO_DONE;
+		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
+			return IPPROTO_DONE;
+		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
+		thoff = th->th_off << 2;
+		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
+			return IPPROTO_DONE;
+		if (m->m_len < hoff + iphlen + thoff)
+			return IPPROTO_DONE;
+		break;
+	case IPPROTO_UDP:
+		if (iplen < iphlen + sizeof(struct udphdr))
+			return IPPROTO_DONE;
+		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
+			return IPPROTO_DONE;
+		break;
+	default:
+		if (iplen < iphlen)
+			return IPPROTO_DONE;
+		break;
+	}
+	return ip->ip_p;
+}
+
 static device_method_t netvsc_methods[] = {
         /* Device interface */
         DEVMETHOD(device_probe,         netvsc_probe),

Modified: head/sys/dev/hyperv/netvsc/hv_rndis.h
==============================================================================
--- head/sys/dev/hyperv/netvsc/hv_rndis.h	Tue Jan 12 01:23:45 2016	(r293718)
+++ head/sys/dev/hyperv/netvsc/hv_rndis.h	Tue Jan 12 01:30:51 2016	(r293719)
@@ -1049,6 +1049,7 @@ typedef struct rndismp_rx_bufs_info_ {
 int netvsc_recv(struct hv_device *device_ctx, 
     netvsc_packet *packet, 
     rndis_tcp_ip_csum_info *csum_info);
+void netvsc_recv_rollup(struct hv_device *device_ctx);
 
 void* hv_set_rppi_data(rndis_msg *rndis_mesg,
     uint32_t rppi_size,

Modified: head/sys/dev/hyperv/netvsc/hv_rndis_filter.c
==============================================================================
--- head/sys/dev/hyperv/netvsc/hv_rndis_filter.c	Tue Jan 12 01:23:45 2016	(r293718)
+++ head/sys/dev/hyperv/netvsc/hv_rndis_filter.c	Tue Jan 12 01:30:51 2016	(r293719)
@@ -963,3 +963,14 @@ hv_rf_on_send_request_halt_completion(vo
 	request->halt_complete_flag = 1;
 }
 
+/*
+ * RNDIS filter: "all" reception is done, so call netvsc_recv_rollup().
+ */
+void
+hv_rf_receive_rollup(netvsc_dev *net_dev)
+{
+	rndis_device *rndis_dev;
+
+	rndis_dev = (rndis_device *)net_dev->extension;
+	netvsc_recv_rollup(rndis_dev->net_dev->dev);
+}

Modified: head/sys/dev/hyperv/netvsc/hv_rndis_filter.h
==============================================================================
--- head/sys/dev/hyperv/netvsc/hv_rndis_filter.h	Tue Jan 12 01:23:45 2016	(r293718)
+++ head/sys/dev/hyperv/netvsc/hv_rndis_filter.h	Tue Jan 12 01:30:51 2016	(r293719)
@@ -98,6 +98,7 @@ typedef struct rndis_device_ {
 
 int hv_rf_on_receive(netvsc_dev *net_dev,
     struct hv_device *device, netvsc_packet *pkt);
+void hv_rf_receive_rollup(netvsc_dev *net_dev);
 int hv_rf_on_device_add(struct hv_device *device, void *additl_info);
 int hv_rf_on_device_remove(struct hv_device *device, boolean_t destroy_channel);
 int hv_rf_on_open(struct hv_device *device);


More information about the svn-src-head mailing list