svn commit: r279453 - in head/sys: dev/pci sys

Ryan Stone rstone at FreeBSD.org
Sun Mar 1 00:40:59 UTC 2015


Author: rstone
Date: Sun Mar  1 00:40:57 2015
New Revision: 279453
URL: https://svnweb.freebsd.org/changeset/base/279453

Log:
  Pass SR-IOV configuration to kernel using an nvlist
  
  Pass all SR-IOV configuration to the kernel using an nvlist.  The
  main benefit that this offers is flexibility.  It allows a driver
  to accept any number of parameters of any type supported by the
  SR-IOV configuration infrastructure without having to make any
  changes outside of the driver.
  
  It also offers the user very fine-grained control over the
  configuration of the VFs -- if they want, they can have different
  configuration applied to every VF.
  
  Differential Revision:	https://reviews.freebsd.org/D82
  Reviewed by:		jhb
  MFC after: 		1 month
  Sponsored by:		Sandvine Inc.

Modified:
  head/sys/dev/pci/pci_if.m
  head/sys/dev/pci/pci_iov.c
  head/sys/sys/iov.h

Modified: head/sys/dev/pci/pci_if.m
==============================================================================
--- head/sys/dev/pci/pci_if.m	Sun Mar  1 00:40:51 2015	(r279452)
+++ head/sys/dev/pci/pci_if.m	Sun Mar  1 00:40:57 2015	(r279453)
@@ -217,6 +217,7 @@ METHOD int iov_detach {
 METHOD int init_iov {
 	device_t		dev;
 	uint16_t		num_vfs;
+	const struct nvlist	*config;
 };
 
 METHOD void uninit_iov {
@@ -226,6 +227,7 @@ METHOD void uninit_iov {
 METHOD int add_vf {
 	device_t		dev;
 	uint16_t		vfnum;
+	const struct nvlist	*config;
 };
 
 METHOD device_t create_iov_child {

Modified: head/sys/dev/pci/pci_iov.c
==============================================================================
--- head/sys/dev/pci/pci_iov.c	Sun Mar  1 00:40:51 2015	(r279452)
+++ head/sys/dev/pci/pci_iov.c	Sun Mar  1 00:40:57 2015	(r279453)
@@ -70,6 +70,18 @@ static struct cdevsw iov_cdevsw = {
 	.d_ioctl = pci_iov_ioctl
 };
 
+SYSCTL_DECL(_hw_pci);
+
+/*
+ * The maximum amount of memory we will allocate for user configuration of an
+ * SR-IOV device.  1MB ought to be enough for anyone, but leave this 
+ * configurable just in case.
+ */
+static u_long pci_iov_max_config = 1024 * 1024;
+SYSCTL_ULONG(_hw_pci, OID_AUTO, iov_max_config, CTLFLAG_RWTUN,
+    &pci_iov_max_config, 0, "Maximum allowed size of SR-IOV configuration.");
+
+
 #define IOV_READ(d, r, w) \
 	pci_read_config((d)->cfg.dev, (d)->cfg.iov->iov_pos + r, w)
 
@@ -348,6 +360,51 @@ pci_iov_add_bars(struct pcicfg_iov *iov,
 	}
 }
 
+static int
+pci_iov_parse_config(struct pcicfg_iov *iov, struct pci_iov_arg *arg,
+    nvlist_t **ret)
+{
+	void *packed_config;
+	nvlist_t *config;
+	int error;
+
+	config = NULL;
+	packed_config = NULL;
+
+	if (arg->len > pci_iov_max_config) {
+		error = EMSGSIZE;
+		goto out;
+	}
+
+	packed_config = malloc(arg->len, M_SRIOV, M_WAITOK);
+
+	error = copyin(arg->config, packed_config, arg->len);
+	if (error != 0)
+		goto out;
+
+	config = nvlist_unpack(packed_config, arg->len);
+	if (config == NULL) {
+		error = EINVAL;
+		goto out;
+	}
+
+	error = pci_iov_schema_validate_config(iov->iov_schema, config);
+	if (error != 0)
+		goto out;
+
+	error = nvlist_error(config);
+	if (error != 0)
+		goto out;
+
+	*ret = config;
+	config = NULL;
+
+out:
+	nvlist_destroy(config);
+	free(packed_config, M_SRIOV);
+	return (error);
+}
+
 /*
  * Set the ARI_EN bit in the lowest-numbered PCI function with the SR-IOV
  * capability.  This bit is only writeable on the lowest-numbered PF but
@@ -422,6 +479,16 @@ pci_iov_config_page_size(struct pci_devi
 }
 
 static int
+pci_init_iov(device_t dev, uint16_t num_vfs, const nvlist_t *config)
+{
+	const nvlist_t *device, *driver_config;
+
+	device = nvlist_get_nvlist(config, PF_CONFIG_NAME);
+	driver_config = nvlist_get_nvlist(device, DRIVER_CONFIG_NAME);
+	return (PCI_INIT_IOV(dev, num_vfs, driver_config));
+}
+
+static int
 pci_iov_init_rman(device_t pf, struct pcicfg_iov *iov)
 {
 	int error;
@@ -479,9 +546,11 @@ pci_iov_setup_bars(struct pci_devinfo *d
 }
 
 static void
-pci_iov_enumerate_vfs(struct pci_devinfo *dinfo, const char *driver,
+pci_iov_enumerate_vfs(struct pci_devinfo *dinfo, const nvlist_t *config,
     uint16_t first_rid, uint16_t rid_stride)
 {
+	char device_name[VF_MAX_NAME];
+	const nvlist_t *device, *driver_config, *iov_config;
 	device_t bus, dev, vf;
 	struct pcicfg_iov *iov;
 	struct pci_devinfo *vfinfo;
@@ -498,12 +567,23 @@ pci_iov_enumerate_vfs(struct pci_devinfo
 	did = IOV_READ(dinfo, PCIR_SRIOV_VF_DID, 2);
 
 	for (i = 0; i < iov->iov_num_vfs; i++, next_rid += rid_stride) {
-
+		snprintf(device_name, sizeof(device_name), VF_PREFIX"%d", i);
+		device = nvlist_get_nvlist(config, device_name);
+		iov_config = nvlist_get_nvlist(device, IOV_CONFIG_NAME);
+		driver_config = nvlist_get_nvlist(device, DRIVER_CONFIG_NAME);
 
 		vf = PCI_CREATE_IOV_CHILD(bus, dev, next_rid, vid, did);
 		if (vf == NULL)
 			break;
 
+		/*
+		 * If we are creating passthrough devices then force the ppt
+		 * driver to attach to prevent a VF driver from claiming the
+		 * VFs.
+		 */
+		if (nvlist_get_bool(iov_config, "passthrough"))
+			device_set_devclass(vf, "ppt");
+
 		vfinfo = device_get_ivars(vf);
 
 		vfinfo->cfg.iov = iov;
@@ -511,7 +591,7 @@ pci_iov_enumerate_vfs(struct pci_devinfo
 
 		pci_iov_add_bars(iov, vfinfo);
 
-		error = PCI_ADD_VF(dev, i);
+		error = PCI_ADD_VF(dev, i, driver_config);
 		if (error != 0) {
 			device_printf(dev, "Failed to add VF %d\n", i);
 			pci_delete_child(bus, vf);
@@ -525,14 +605,14 @@ static int
 pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg)
 {
 	device_t bus, dev;
-	const char *driver;
 	struct pci_devinfo *dinfo;
 	struct pcicfg_iov *iov;
+	nvlist_t *config;
 	int i, error;
 	uint16_t rid_off, rid_stride;
 	uint16_t first_rid, last_rid;
 	uint16_t iov_ctl;
-	uint16_t total_vfs;
+	uint16_t num_vfs, total_vfs;
 	int iov_inited;
 
 	mtx_lock(&Giant);
@@ -541,6 +621,7 @@ pci_iov_config(struct cdev *cdev, struct
 	dev = dinfo->cfg.dev;
 	bus = device_get_parent(dev);
 	iov_inited = 0;
+	config = NULL;
 
 	if ((iov->iov_flags & IOV_BUSY) || iov->iov_num_vfs != 0) {
 		mtx_unlock(&Giant);
@@ -548,22 +629,17 @@ pci_iov_config(struct cdev *cdev, struct
 	}
 	iov->iov_flags |= IOV_BUSY;
 
-	total_vfs = IOV_READ(dinfo, PCIR_SRIOV_TOTAL_VFS, 2);
+	error = pci_iov_parse_config(iov, arg, &config);
+	if (error != 0)
+		goto out;
 
-	if (arg->num_vfs > total_vfs) {
+	num_vfs = pci_iov_config_get_num_vfs(config);
+	total_vfs = IOV_READ(dinfo, PCIR_SRIOV_TOTAL_VFS, 2);
+	if (num_vfs > total_vfs) {
 		error = EINVAL;
 		goto out;
 	}
 
-	/*
-	 * If we are creating passthrough devices then force the ppt driver to
-	 * attach to prevent a VF driver from claming the VFs.
-	 */
-	if (arg->passthrough)
-		driver = "ppt";
-	else
-		driver = NULL;
-
 	error = pci_iov_config_page_size(dinfo);
 	if (error != 0)
 		goto out;
@@ -572,19 +648,18 @@ pci_iov_config(struct cdev *cdev, struct
 	if (error != 0)
 		goto out;
 
-	error = PCI_INIT_IOV(dev, arg->num_vfs);
-
+	error = pci_init_iov(dev, num_vfs, config);
 	if (error != 0)
 		goto out;
-
 	iov_inited = 1;
-	IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, arg->num_vfs, 2);
+
+	IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, num_vfs, 2);
 
 	rid_off = IOV_READ(dinfo, PCIR_SRIOV_VF_OFF, 2);
 	rid_stride = IOV_READ(dinfo, PCIR_SRIOV_VF_STRIDE, 2);
 
 	first_rid = pci_get_rid(dev) + rid_off;
-	last_rid = first_rid + (arg->num_vfs - 1) * rid_stride;
+	last_rid = first_rid + (num_vfs - 1) * rid_stride;
 
 	/* We don't yet support allocating extra bus numbers for VFs. */
 	if (pci_get_bus(dev) != PCI_RID2BUS(last_rid)) {
@@ -600,7 +675,7 @@ pci_iov_config(struct cdev *cdev, struct
 	if (error != 0)
 		goto out;
 
-	iov->iov_num_vfs = arg->num_vfs;
+	iov->iov_num_vfs = num_vfs;
 
 	error = pci_iov_setup_bars(dinfo);
 	if (error != 0)
@@ -612,7 +687,10 @@ pci_iov_config(struct cdev *cdev, struct
 
 	/* Per specification, we must wait 100ms before accessing VFs. */
 	pause("iov", roundup(hz, 10));
-	pci_iov_enumerate_vfs(dinfo, driver, first_rid, rid_stride);
+	pci_iov_enumerate_vfs(dinfo, config, first_rid, rid_stride);
+
+	nvlist_destroy(config);
+	iov->iov_flags &= ~IOV_BUSY;
 	mtx_unlock(&Giant);
 
 	return (0);
@@ -635,6 +713,8 @@ out:
 		rman_fini(&iov->rman);
 		iov->iov_flags &= ~IOV_RMAN_INITED;
 	}
+
+	nvlist_destroy(config);
 	iov->iov_num_vfs = 0;
 	iov->iov_flags &= ~IOV_BUSY;
 	mtx_unlock(&Giant);

Modified: head/sys/sys/iov.h
==============================================================================
--- head/sys/sys/iov.h	Sun Mar  1 00:40:51 2015	(r279452)
+++ head/sys/sys/iov.h	Sun Mar  1 00:40:57 2015	(r279453)
@@ -46,12 +46,6 @@
 #define	DEFAULT_SCHEMA_NAME	"DEFAULT"
 #define	REQUIRED_SCHEMA_NAME	"REQUIRED"
 
-struct pci_iov_arg
-{
-	int num_vfs;
-	int passthrough;
-};
-
 /*
  * Because each PF device is expected to expose a unique set of possible
  * configurations, the SR-IOV infrastructure dynamically queries the PF
@@ -168,7 +162,94 @@ struct pci_iov_schema
 	int error;
 };
 
-#define	IOV_CONFIG	_IOWR('p', 10, struct pci_iov_arg)
+/*
+ * SR-IOV configuration is passed to the kernel as a packed nvlist.  See nv(3)
+ * for the details of the nvlist API.  The expected format of the nvlist is:
+ *
+ * BASIC RULES
+ *   1) All keys are case-insensitive.
+ *   2) No keys that are not specified below may exist at any level of the
+ *      config nvlist.
+ *   3) Unless otherwise specified, all keys are optional.  It should go without
+ *      saying that a key being mandatory is transitive: that is, if a key is
+ *      specified to contain a sub-node that contains a mandatory key, then
+ *      the outer key is implicitly mandatory.  If a key is mandatory then the
+ *      associated value is also mandatory.
+ *   4) Order of keys is irrelevant.
+ *
+ * TOP LEVEL OF CONFIG NVLIST
+ * 1) All keys specified in this section are mandatory.
+ * 2) There must be a top-level key with the name PF_CONFIG_NAME.  The value
+ *    associated is an nvlist that follows the "device node" format.  The
+ *    parameters in this node specify parameters that apply to the PF.
+ * 3) For every VF being configured (this is set via the "num_vfs" parameter
+ *    in the PF section), there must be a top-level key whose name is VF_PREFIX
+ *    immediately followed by the index of the VF as a decimal integer.  For
+ *    example, this would be VF-0 for the first VF.  VFs are numbered starting
+ *    from 0.  The value associated with this key follows the "device node"
+ *    format.  The parameters in this node specify configuration that applies
+ *    to the VF specified in the key.  Leading zeros are not permitted in VF
+ *    index.  Configuration for the second VF must be specified in a node with
+ *    the key VF-1.  VF-01 is not a valid key.
+ *
+ * DEVICE NODES
+ * 1) All keys specified in this section are mandatory.
+ * 2) The device node must contain a key with the name DRIVER_CONFIG_NAME.  The
+ *    value associated with this key is an nvlist following the subsystem node
+ *    format.  The parameters in this key specify configuration that is specific
+ *    to a particular device driver.
+ * 3) The device node must contain a key with the name IOV_CONFIG_NAME.  The
+ *    value associated with this key is an nvlist following the subsystem node
+ *    format.  The parameters in this key specify configuration that is consumed
+ *    by the SR-IOV infrastructure.
+ *
+ * SUBSYSTEM NODES
+ * 1) A subsystem node specifies configuration parameters that apply to a
+ *    particular subsystem (driver or infrastructure) of a particular device
+ *    (PF or individual VF).
+ *         Note: We will refer to the section of the configuration schema that
+ *               specifies the parameters for this subsystem and device
+ *               configuration as the device/subsystem schema.
+ * 2) The subsystem node must contain only keys that correspond to parameters
+ *    that are specified in the device/subsystem schema.
+ * 3) Every parameter specified as required in the device/subsystem schema is
+ *    a mandatory key in the subsystem node.
+ *    Note:  All parameters that are not required in device/subsystem schema are
+ *           optional keys.  In particular, any parameter specified to have a
+ *           default value in the device/subsystem schema is optional.  The
+ *           kernel is responsible for applying default values.
+ * 4) The value of every parameter in the subsystem node must conform to the
+ *    restrictions of the type specified for that parameter in the device/
+ *    subsystem schema.
+ *
+ * The following is an example of a valid configuration, when validated against
+ * the schema example given above.
+ *
+ * PF (NVLIST):
+ *     driver (NVLIST):
+ *     iov (NVLIST):
+ *         num_vfs (NUMBER): 3 (3) (0x3)
+ *         device (STRING): [ix0]
+ * VF-0 (NVLIST):
+ *     driver (NVLIST):
+ *         vlan (NUMBER): 1000 (1000) (0x3e8)
+ *     iov (NVLIST):
+ *         passthrough (BOOL): TRUE
+ * VF-1 (NVLIST):
+ *     driver (NVLIST):
+ *     iov (NVLIST):
+ * VF-2 (NVLIST):
+ *     driver (NVLIST):
+ *         mac-addr (BINARY): 6 020102030405
+ *     iov (NVLIST):
+ */
+struct pci_iov_arg
+{
+	void *config;
+	size_t len;
+};
+
+#define	IOV_CONFIG	_IOW('p', 10, struct pci_iov_arg)
 #define	IOV_DELETE	_IO('p', 11)
 #define	IOV_GET_SCHEMA	_IOWR('p', 12, struct pci_iov_schema)
 


More information about the svn-src-head mailing list