svn commit: r306520 - in stable: 10/share/man/man9 10/sys/amd64/vmm/io 10/sys/dev/pci 11/share/man/man9 11/sys/amd64/vmm/io 11/sys/dev/pci

John Baldwin jhb at FreeBSD.org
Fri Sep 30 18:47:37 UTC 2016


Author: jhb
Date: Fri Sep 30 18:47:34 2016
New Revision: 306520
URL: https://svnweb.freebsd.org/changeset/base/306520

Log:
  MFC 305502: Reset PCI pass through devices via PCI-e FLR during VM start/end.
  
  Add routines to trigger a function level reset (FLR) of a PCI-express
  device via the PCI-express device control register.  This also includes
  support routines to wait for pending transactions to complete as well
  as calculating the maximum completion timeout permitted by a device.
  
  Change the ppt(4) driver to reset pass through devices before attaching
  to a VM during startup and before detaching from a VM during shutdown.
  
  Sponsored by:	Chelsio Communications

Modified:
  stable/11/share/man/man9/Makefile
  stable/11/share/man/man9/pci.9
  stable/11/sys/amd64/vmm/io/ppt.c
  stable/11/sys/dev/pci/pci.c
  stable/11/sys/dev/pci/pcireg.h
  stable/11/sys/dev/pci/pcivar.h
Directory Properties:
  stable/11/   (props changed)

Changes in other areas also in this revision:
Modified:
  stable/10/share/man/man9/Makefile
  stable/10/share/man/man9/pci.9
  stable/10/sys/amd64/vmm/io/ppt.c
  stable/10/sys/dev/pci/pci.c
  stable/10/sys/dev/pci/pcireg.h
  stable/10/sys/dev/pci/pcivar.h
Directory Properties:
  stable/10/   (props changed)

Modified: stable/11/share/man/man9/Makefile
==============================================================================
--- stable/11/share/man/man9/Makefile	Fri Sep 30 18:43:39 2016	(r306519)
+++ stable/11/share/man/man9/Makefile	Fri Sep 30 18:47:34 2016	(r306520)
@@ -1319,7 +1319,10 @@ MLINKS+=pci.9 pci_alloc_msi.9 \
 	pci.9 pci_set_max_read_req.9 \
 	pci.9 pci_write_config.9 \
 	pci.9 pcie_adjust_config.9 \
+	pci.9 pcie_flr.9 \
+	pci.9 pcie_get_max_completion_timeout.9 \
 	pci.9 pcie_read_config.9 \
+	pci.9 pcie_wait_for_pending_transactions.9 \
 	pci.9 pcie_write_config.9
 MLINKS+=pci_iov_schema.9 pci_iov_schema_alloc_node.9 \
 	pci_iov_schema.9 pci_iov_schema_add_bool.9 \

Modified: stable/11/share/man/man9/pci.9
==============================================================================
--- stable/11/share/man/man9/pci.9	Fri Sep 30 18:43:39 2016	(r306519)
+++ stable/11/share/man/man9/pci.9	Fri Sep 30 18:47:34 2016	(r306520)
@@ -66,7 +66,10 @@
 .Nm pci_set_powerstate ,
 .Nm pci_write_config ,
 .Nm pcie_adjust_config ,
+.Nm pcie_flr ,
+.Nm pcie_get_max_completion_timeout ,
 .Nm pcie_read_config ,
+.Nm pcie_wait_for_pending_transactions ,
 .Nm pcie_write_config
 .Nd PCI bus interface
 .Sh SYNOPSIS
@@ -145,8 +148,14 @@
 .Fa "uint32_t val"
 .Fa "int width"
 .Fc
+.Ft bool
+.Fn pcie_flr "device_t dev" "u_int max_delay" "bool force"
+.Ft int
+.Fn pcie_get_max_completion_timeout "device_t dev"
 .Ft uint32_t
 .Fn pcie_read_config "device_t dev" "int reg" "int width"
+.Ft bool
+.Fn pcie_wait_for_pending_transactions "device_t dev" "u_int max_delay"
 .Ft void
 .Fn pcie_write_config "device_t dev" "int reg" "uint32_t val" "int width"
 .Ft void
@@ -431,6 +440,51 @@ keyword,
 then
 .Fn pci_get_vpd_readonly
 returns an error.
+.Pp
+The
+.Fn pcie_get_max_completion_timeout
+function returns the maximum completion timeout configured for the device
+.Fa dev
+in microseconds.
+If the
+.Fa dev
+device is not a PCI-express device,
+.Fn pcie_get_max_completion_timeout
+returns zero.
+When completion timeouts are disabled for
+.Fa dev ,
+this function returns the maximum timeout that would be used if timeouts
+were enabled.
+.Pp
+The
+.Fn pcie_wait_for_pending_transactions
+function waits for any pending transactions initiated by the
+.Fa dev
+device to complete.
+The function checks for pending transactions by polling the transactions
+pending flag in the PCI-express device status register.
+It returns
+.Dv true
+once the transaction pending flag is clear.
+If transactions are still pending after
+.Fa max_delay
+milliseconds,
+.Fn pcie_wait_for_pending_transactions
+returns
+.Dv false .
+If
+.Fa max_delay
+is set to zero,
+.Fn pcie_wait_for_pending_transactions
+performs a single check;
+otherwise,
+this function may sleep while polling the transactions pending flag.
+.Nm pcie_wait_for_pending_transactions
+returns
+.Dv true
+if
+.Fa dev
+is not a PCI-express device.
 .Ss Device Configuration
 The
 .Fn pci_enable_busmaster
@@ -662,6 +716,51 @@ is invoked,
 then the device will be transitioned to
 .Dv PCI_POWERSTATE_D0
 before any config registers are restored.
+.Pp
+The
+.Fn pcie_flr
+function requests a Function Level Reset
+.Pq FLR
+of
+.Fa dev .
+If
+.Fa dev
+is not a PCI-express device or does not support Function Level Resets via
+the PCI-express device control register,
+.Dv false
+is returned.
+Pending transactions are drained by disabling busmastering and calling
+.Fn pcie_wait_for_pending_transactions
+before resetting the device.
+The
+.Fa max_delay
+argument specifies the maximum timeout to wait for pending transactions as
+described for
+.Fn pcie_wait_for_pending_transactions .
+If
+.Fn pcie_wait_for_pending_transactions
+fails with a timeout and
+.Fa force
+is
+.Dv false ,
+busmastering is re-enabled and
+.Dv false
+is returned.
+If
+.Fn pcie_wait_for_pending_transactions
+fails with a timeout and
+.Fa force
+is
+.Dv true ,
+the device is reset despite the timeout.
+After the reset has been requested,
+.Nm pcie_flr
+sleeps for at least 100 milliseconds before returning
+.Dv true .
+Note that
+.Nm pcie_flr
+does not save and restore any state around the reset.
+The caller should save and restore state as needed.
 .Ss Message Signaled Interrupts
 Message Signaled Interrupts
 .Pq MSI

Modified: stable/11/sys/amd64/vmm/io/ppt.c
==============================================================================
--- stable/11/sys/amd64/vmm/io/ppt.c	Fri Sep 30 18:43:39 2016	(r306519)
+++ stable/11/sys/amd64/vmm/io/ppt.c	Fri Sep 30 18:47:34 2016	(r306520)
@@ -362,6 +362,11 @@ ppt_assign_device(struct vm *vm, int bus
 		if (ppt->vm != NULL && ppt->vm != vm)
 			return (EBUSY);
 
+		pci_save_state(ppt->dev);
+		pcie_flr(ppt->dev,
+		    max(pcie_get_max_completion_timeout(ppt->dev) / 1000, 10),
+		    true);
+		pci_restore_state(ppt->dev);
 		ppt->vm = vm;
 		iommu_remove_device(iommu_host_domain(), pci_get_rid(ppt->dev));
 		iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
@@ -382,6 +387,12 @@ ppt_unassign_device(struct vm *vm, int b
 		 */
 		if (ppt->vm != vm)
 			return (EBUSY);
+
+		pci_save_state(ppt->dev);
+		pcie_flr(ppt->dev,
+		    max(pcie_get_max_completion_timeout(ppt->dev) / 1000, 10),
+		    true);
+		pci_restore_state(ppt->dev);
 		ppt_unmap_mmio(vm, ppt);
 		ppt_teardown_msi(ppt);
 		ppt_teardown_msix(ppt);

Modified: stable/11/sys/dev/pci/pci.c
==============================================================================
--- stable/11/sys/dev/pci/pci.c	Fri Sep 30 18:43:39 2016	(r306519)
+++ stable/11/sys/dev/pci/pci.c	Fri Sep 30 18:47:34 2016	(r306520)
@@ -5891,3 +5891,165 @@ pci_find_pcie_root_port(device_t dev)
 		dev = pcib;
 	}
 }
+
+/*
+ * Wait for pending transactions to complete on a PCI-express function.
+ *
+ * The maximum delay is specified in milliseconds in max_delay; a
+ * max_delay of zero performs a single check.  This function may sleep.
+ *
+ * Returns true if the function is idle and false if the timeout is
+ * exceeded.  If dev is not a PCI-express function, this returns true.
+ */
+bool
+pcie_wait_for_pending_transactions(device_t dev, u_int max_delay)
+{
+	struct pci_devinfo *dinfo = device_get_ivars(dev);
+	uint16_t sta;
+	int cap;
+
+	cap = dinfo->cfg.pcie.pcie_location;
+	if (cap == 0)
+		return (true);
+
+	sta = pci_read_config(dev, cap + PCIER_DEVICE_STA, 2);
+	while (sta & PCIEM_STA_TRANSACTION_PND) {
+		if (max_delay == 0)
+			return (false);
+
+		/* Poll every 100ms; the last sleep covers any remainder. */
+		if (max_delay > 100) {
+			pause_sbt("pcietp", 100 * SBT_1MS, 0, C_HARDCLOCK);
+			max_delay -= 100;
+		} else {
+			pause_sbt("pcietp", max_delay * SBT_1MS, 0,
+			    C_HARDCLOCK);
+			max_delay = 0;
+		}
+		sta = pci_read_config(dev, cap + PCIER_DEVICE_STA, 2);
+	}
+
+	return (true);
+}
+
+/*
+ * Determine the maximum Completion Timeout in microseconds, based on
+ * the timeout value programmed in the Device Control 2 register.
+ * For non-PCI-express functions this returns 0.
+ */
+int
+pcie_get_max_completion_timeout(device_t dev)
+{
+	struct pci_devinfo *dinfo = device_get_ivars(dev);
+	int cap;
+
+	cap = dinfo->cfg.pcie.pcie_location;
+	if (cap == 0)
+		return (0);
+
+	/*
+	 * Functions using the 1.x spec use the default timeout range of
+	 * 50 microseconds to 50 milliseconds.  Functions that do not
+	 * support programmable timeouts also use this range.
+	 */
+	if ((dinfo->cfg.pcie.pcie_flags & PCIEM_FLAGS_VERSION) < 2 ||
+	    (pci_read_config(dev, cap + PCIER_DEVICE_CAP2, 4) &
+	    PCIEM_CAP2_COMP_TIMO_RANGES) == 0)
+		return (50 * 1000);
+
+	switch (pci_read_config(dev, cap + PCIER_DEVICE_CTL2, 2) &
+	    PCIEM_CTL2_COMP_TIMO_VAL) {
+	case PCIEM_CTL2_COMP_TIMO_100US:
+		return (100);
+	case PCIEM_CTL2_COMP_TIMO_10MS:
+		return (10 * 1000);
+	case PCIEM_CTL2_COMP_TIMO_55MS:
+		return (55 * 1000);
+	case PCIEM_CTL2_COMP_TIMO_210MS:
+		return (210 * 1000);
+	case PCIEM_CTL2_COMP_TIMO_900MS:
+		return (900 * 1000);
+	case PCIEM_CTL2_COMP_TIMO_3500MS:
+		return (3500 * 1000);
+	case PCIEM_CTL2_COMP_TIMO_13S:
+		return (13 * 1000 * 1000);
+	case PCIEM_CTL2_COMP_TIMO_64S:
+		return (64 * 1000 * 1000);
+	default:	/* PCIEM_CTL2_COMP_TIMO_50MS or any unlisted value */
+		return (50 * 1000);
+	}
+}
+
+/*
+ * Perform a Function Level Reset (FLR) on a device.
+ *
+ * This function first waits for any pending transactions to complete
+ * within the timeout specified by max_delay.  If transactions are
+ * still pending and force is false, the function returns false
+ * without attempting a reset; if force is true it resets anyway.
+ *
+ * If dev is not a PCI-express function or does not support FLR, this
+ * function returns false.
+ *
+ * Note that no registers are saved or restored.  The caller is
+ * responsible for saving and restoring any registers including
+ * PCI-standard registers via pci_save_state() and
+ * pci_restore_state().
+ */
+bool
+pcie_flr(device_t dev, u_int max_delay, bool force)
+{
+	struct pci_devinfo *dinfo = device_get_ivars(dev);
+	uint16_t cmd, ctl;
+	int compl_delay;
+	int cap;
+
+	cap = dinfo->cfg.pcie.pcie_location;
+	if (cap == 0)
+		return (false);
+
+	if (!(pci_read_config(dev, cap + PCIER_DEVICE_CAP, 4) & PCIEM_CAP_FLR))
+		return (false);
+
+	/*
+	 * Disable busmastering to prevent generation of new
+	 * transactions while waiting for the device to go idle.  If
+	 * the idle wait times out and the reset is not forced, the
+	 * command register is restored to re-enable busmastering.
+	 */
+	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
+	pci_write_config(dev, PCIR_COMMAND, cmd & ~(PCIM_CMD_BUSMASTEREN), 2);
+	if (!pcie_wait_for_pending_transactions(dev, max_delay)) {
+		if (!force) {
+			pci_write_config(dev, PCIR_COMMAND, cmd, 2);
+			return (false);
+		}
+		pci_printf(&dinfo->cfg,
+		    "Resetting with transactions pending after %u ms\n",
+		    max_delay);
+
+		/*
+		 * Extend the post-FLR delay to cover the maximum
+		 * Completion Timeout delay of anything in flight
+		 * during the FLR delay.  Enforce a minimum delay of
+		 * at least 10ms.
+		 */
+		compl_delay = pcie_get_max_completion_timeout(dev) / 1000;
+		if (compl_delay < 10)
+			compl_delay = 10;
+	} else
+		compl_delay = 0;
+
+	/* Initiate the reset. */
+	ctl = pci_read_config(dev, cap + PCIER_DEVICE_CTL, 2);
+	pci_write_config(dev, cap + PCIER_DEVICE_CTL, ctl |
+	    PCIEM_CTL_INITIATE_FLR, 2);
+
+	/* Wait for 100ms plus any extended completion delay. */
+	pause_sbt("pcieflr", (100 + compl_delay) * SBT_1MS, 0, C_HARDCLOCK);
+
+	if (pci_read_config(dev, cap + PCIER_DEVICE_STA, 2) &
+	    PCIEM_STA_TRANSACTION_PND)
+		pci_printf(&dinfo->cfg, "Transactions pending after FLR!\n");
+	return (true);
+}

Modified: stable/11/sys/dev/pci/pcireg.h
==============================================================================
--- stable/11/sys/dev/pci/pcireg.h	Fri Sep 30 18:43:39 2016	(r306519)
+++ stable/11/sys/dev/pci/pcireg.h	Fri Sep 30 18:47:34 2016	(r306520)
@@ -885,10 +885,25 @@
 #define	PCIEM_ROOT_STA_PME_STATUS	0x00010000
 #define	PCIEM_ROOT_STA_PME_PEND		0x00020000
 #define	PCIER_DEVICE_CAP2	0x24
-#define	PCIEM_CAP2_ARI		0x20
+#define	PCIEM_CAP2_COMP_TIMO_RANGES	0x0000000f
+#define	PCIEM_CAP2_COMP_TIMO_RANGE_A	0x00000001
+#define	PCIEM_CAP2_COMP_TIMO_RANGE_B	0x00000002
+#define	PCIEM_CAP2_COMP_TIMO_RANGE_C	0x00000004
+#define	PCIEM_CAP2_COMP_TIMO_RANGE_D	0x00000008
+#define	PCIEM_CAP2_COMP_TIMO_DISABLE	0x00000010
+#define	PCIEM_CAP2_ARI			0x00000020
 #define	PCIER_DEVICE_CTL2	0x28
-#define	PCIEM_CTL2_COMP_TIMEOUT_VAL	0x000f
-#define	PCIEM_CTL2_COMP_TIMEOUT_DIS	0x0010
+#define	PCIEM_CTL2_COMP_TIMO_VAL	0x000f
+#define	PCIEM_CTL2_COMP_TIMO_50MS	0x0000
+#define	PCIEM_CTL2_COMP_TIMO_100US	0x0001
+#define	PCIEM_CTL2_COMP_TIMO_10MS	0x0002
+#define	PCIEM_CTL2_COMP_TIMO_55MS	0x0005
+#define	PCIEM_CTL2_COMP_TIMO_210MS	0x0006
+#define	PCIEM_CTL2_COMP_TIMO_900MS	0x0009
+#define	PCIEM_CTL2_COMP_TIMO_3500MS	0x000a
+#define	PCIEM_CTL2_COMP_TIMO_13S	0x000d
+#define	PCIEM_CTL2_COMP_TIMO_64S	0x000e
+#define	PCIEM_CTL2_COMP_TIMO_DISABLE	0x0010
 #define	PCIEM_CTL2_ARI			0x0020
 #define	PCIEM_CTL2_ATOMIC_REQ_ENABLE	0x0040
 #define	PCIEM_CTL2_ATOMIC_EGR_BLOCK	0x0080

Modified: stable/11/sys/dev/pci/pcivar.h
==============================================================================
--- stable/11/sys/dev/pci/pcivar.h	Fri Sep 30 18:43:39 2016	(r306519)
+++ stable/11/sys/dev/pci/pcivar.h	Fri Sep 30 18:47:34 2016	(r306520)
@@ -595,7 +595,9 @@ uint32_t pcie_read_config(device_t dev, 
 void	pcie_write_config(device_t dev, int reg, uint32_t value, int width);
 uint32_t pcie_adjust_config(device_t dev, int reg, uint32_t mask,
 	    uint32_t value, int width);
-
+bool	pcie_flr(device_t dev, u_int max_delay, bool force);
+int	pcie_get_max_completion_timeout(device_t dev);
+bool	pcie_wait_for_pending_transactions(device_t dev, u_int max_delay);
 
 #ifdef BUS_SPACE_MAXADDR
 #if (BUS_SPACE_MAXADDR > 0xFFFFFFFF)


More information about the svn-src-all mailing list