git: f0f3e3e961d3 - main - ipmi: use a queue for kcs driver requests when possible

From: Chuck Silvers <chs_at_FreeBSD.org>
Date: Tue, 01 Nov 2022 17:55:22 UTC
The branch main has been updated by chs:

URL: https://cgit.FreeBSD.org/src/commit/?id=f0f3e3e961d311f1cd938f1319385e7f454525f1

commit f0f3e3e961d311f1cd938f1319385e7f454525f1
Author:     Chuck Silvers <chs@FreeBSD.org>
AuthorDate: 2022-11-01 17:55:14 +0000
Commit:     Chuck Silvers <chs@FreeBSD.org>
CommitDate: 2022-11-01 17:55:14 +0000

    ipmi: use a queue for kcs driver requests when possible
    
    The ipmi watchdog pretimeout action can trigger unintentionally in
    certain rare, complicated situations.  What we have seen at Netflix
    is that the BMC can sometimes be sent a continuous stream of
    writes to port 0x80, and due to what is a bug or misconfiguration
    in the BMC software, this results in the BMC running out of memory,
    becoming very slow to respond to KCS requests, and eventually being
    rebooted by its own internal watchdog.  While that is going on in
    the BMC, back in the host OS, a number of requests are pending in
    the ipmi request queue, and the kcs_loop thread is working on
    processing these requests.  All of the KCS accesses to process
    those requests are timing out and eventually failing because the
    BMC is responding very slowly or not at all, and the kcs_loop thread
    is holding the IPMI_IO_LOCK the whole time that is going on.
    Meanwhile the watchdogd process in the host is trying to pat the
    BMC watchdog, and this process is sleeping waiting to get the
    IPMI_IO_LOCK.  It's not entirely clear why the watchdogd process
    is sleeping for this lock, because the intention is that a thread
    holding the IPMI_IO_LOCK should not sleep and thus any thread
    that wants the lock should just spin to wait for it.  My best guess
    is that the kcs_loop thread is spinning waiting for the BMC to
    respond for so long that it is eventually preempted, and during
    the brief interval when the kcs_loop thread is not running,
    the watchdogd thread notices that the lock holder is not running
    and sleeps.  When the kcs_loop thread eventually finishes processing
    one request, it drops the IPMI_IO_LOCK and then immediately takes the
    lock again so it can process the next request in the queue.
    Because the watchdogd thread is sleeping at this point, the kcs_loop
    always wins the race to acquire the IPMI_IO_LOCK, thus starving
    the watchdogd thread.  The callout for the watchdog pretimeout
    would be reset by the watchdogd thread after its request to the BMC
    watchdog completes, but since that request never processed, the
    pretimeout callout eventually fires, even though there is nothing
    actually wrong with the host.
    
    To prevent this saga from unfolding:
    
     - when kcs_driver_request() is called in a context where it can sleep,
       queue the request and let the worker thread process it rather than
       trying to process in the original thread.
     - add a new high-priority queue for driver requests, so that the
       watchdog patting requests will be processed as quickly as possible
       even if lots of application requests have already been queued.
    
    With these two changes, the watchdog pretimeout action does not trigger
    even if the BMC is completely out to lunch for long periods of time
    (as long as the watchdogd check command does not also get stuck).
    
    Sponsored by:   Netflix
    Reviewed by:    imp
    Differential Revision:  https://reviews.freebsd.org/D36555
---
 sys/dev/ipmi/ipmi.c     | 33 ++++++++++++++++++++++++++++++---
 sys/dev/ipmi/ipmi_kcs.c | 28 +++++++++++++++++++++++++++-
 sys/dev/ipmi/ipmivars.h |  2 ++
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/sys/dev/ipmi/ipmi.c b/sys/dev/ipmi/ipmi.c
index d79690d55c68..b8705a81627b 100644
--- a/sys/dev/ipmi/ipmi.c
+++ b/sys/dev/ipmi/ipmi.c
@@ -208,6 +208,15 @@ ipmi_dtor(void *arg)
 	IPMI_LOCK(sc);
 	if (dev->ipmi_requests) {
 		/* Throw away any pending requests for this device. */
+		TAILQ_FOREACH_SAFE(req, &sc->ipmi_pending_requests_highpri, ir_link,
+		    nreq) {
+			if (req->ir_owner == dev) {
+				TAILQ_REMOVE(&sc->ipmi_pending_requests_highpri, req,
+				    ir_link);
+				dev->ipmi_requests--;
+				ipmi_free_request(req);
+			}
+		}
 		TAILQ_FOREACH_SAFE(req, &sc->ipmi_pending_requests, ir_link,
 		    nreq) {
 			if (req->ir_owner == dev) {
@@ -579,13 +588,19 @@ ipmi_dequeue_request(struct ipmi_softc *sc)
 
 	IPMI_LOCK_ASSERT(sc);
 
-	while (!sc->ipmi_detaching && TAILQ_EMPTY(&sc->ipmi_pending_requests))
+	while (!sc->ipmi_detaching && TAILQ_EMPTY(&sc->ipmi_pending_requests) &&
+	    TAILQ_EMPTY(&sc->ipmi_pending_requests_highpri))
 		cv_wait(&sc->ipmi_request_added, &sc->ipmi_requests_lock);
 	if (sc->ipmi_detaching)
 		return (NULL);
 
-	req = TAILQ_FIRST(&sc->ipmi_pending_requests);
-	TAILQ_REMOVE(&sc->ipmi_pending_requests, req, ir_link);
+	req = TAILQ_FIRST(&sc->ipmi_pending_requests_highpri);
+	if (req != NULL)
+		TAILQ_REMOVE(&sc->ipmi_pending_requests_highpri, req, ir_link);
+	else {
+		req = TAILQ_FIRST(&sc->ipmi_pending_requests);
+		TAILQ_REMOVE(&sc->ipmi_pending_requests, req, ir_link);
+	}
 	return (req);
 }
 
@@ -601,6 +616,17 @@ ipmi_polled_enqueue_request(struct ipmi_softc *sc, struct ipmi_request *req)
 	return (0);
 }
 
+int
+ipmi_polled_enqueue_request_highpri(struct ipmi_softc *sc, struct ipmi_request *req)
+{
+
+	IPMI_LOCK_ASSERT(sc);
+
+	TAILQ_INSERT_TAIL(&sc->ipmi_pending_requests_highpri, req, ir_link);
+	cv_signal(&sc->ipmi_request_added);
+	return (0);
+}
+
 /*
  * Watchdog event handler.
  */
@@ -817,6 +843,7 @@ ipmi_startup(void *arg)
 	mtx_init(&sc->ipmi_requests_lock, "ipmi requests", NULL, MTX_DEF);
 	mtx_init(&sc->ipmi_io_lock, "ipmi io", NULL, MTX_DEF);
 	cv_init(&sc->ipmi_request_added, "ipmireq");
+	TAILQ_INIT(&sc->ipmi_pending_requests_highpri);
 	TAILQ_INIT(&sc->ipmi_pending_requests);
 
 	/* Initialize interface-dependent state. */
diff --git a/sys/dev/ipmi/ipmi_kcs.c b/sys/dev/ipmi/ipmi_kcs.c
index df3b37614eb7..5908ec88e039 100644
--- a/sys/dev/ipmi/ipmi_kcs.c
+++ b/sys/dev/ipmi/ipmi_kcs.c
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
+#include <sys/conf.h>
 #include <sys/condvar.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
@@ -490,7 +491,21 @@ kcs_startup(struct ipmi_softc *sc)
 }
 
 static int
-kcs_driver_request(struct ipmi_softc *sc, struct ipmi_request *req, int timo)
+kcs_driver_request_queue(struct ipmi_softc *sc, struct ipmi_request *req, int timo)
+{
+	int error;
+
+	IPMI_LOCK(sc);
+	ipmi_polled_enqueue_request_highpri(sc, req);
+	error = msleep(req, &sc->ipmi_requests_lock, 0, "ipmireq", timo);
+	if (error == 0)
+		error = req->ir_error;
+	IPMI_UNLOCK(sc);
+	return (error);
+}
+
+static int
+kcs_driver_request_poll(struct ipmi_softc *sc, struct ipmi_request *req)
 {
 	int i, ok;
 
@@ -504,6 +519,17 @@ kcs_driver_request(struct ipmi_softc *sc, struct ipmi_request *req, int timo)
 	return (req->ir_error);
 }
 
+static int
+kcs_driver_request(struct ipmi_softc *sc, struct ipmi_request *req, int timo)
+{
+
+	if (KERNEL_PANICKED() || dumping)
+		return (kcs_driver_request_poll(sc, req));
+	else
+		return (kcs_driver_request_queue(sc, req, timo));
+}
+
+
 int
 ipmi_kcs_attach(struct ipmi_softc *sc)
 {
diff --git a/sys/dev/ipmi/ipmivars.h b/sys/dev/ipmi/ipmivars.h
index b0548ee3d7c3..1468e86be73b 100644
--- a/sys/dev/ipmi/ipmivars.h
+++ b/sys/dev/ipmi/ipmivars.h
@@ -111,6 +111,7 @@ struct ipmi_softc {
 	uint8_t			ipmi_dev_support;	/* IPMI_ADS_* */
 	struct cdev		*ipmi_cdev;
 	TAILQ_HEAD(,ipmi_request) ipmi_pending_requests;
+	TAILQ_HEAD(,ipmi_request) ipmi_pending_requests_highpri;
 	int			ipmi_driver_requests_polled;
 	eventhandler_tag	ipmi_power_cycle_tag;
 	eventhandler_tag	ipmi_watchdog_tag;
@@ -237,6 +238,7 @@ void	ipmi_complete_request(struct ipmi_softc *, struct ipmi_request *);
 struct ipmi_request *ipmi_dequeue_request(struct ipmi_softc *);
 void	ipmi_free_request(struct ipmi_request *);
 int	ipmi_polled_enqueue_request(struct ipmi_softc *, struct ipmi_request *);
+int	ipmi_polled_enqueue_request_highpri(struct ipmi_softc *, struct ipmi_request *);
 int	ipmi_submit_driver_request(struct ipmi_softc *, struct ipmi_request *,
 	    int);