[PATCH] Support for S5 (soft power off) in bhyve

John Baldwin jhb at freebsd.org
Fri Dec 27 17:10:34 UTC 2013


On Thursday, December 26, 2013 11:37:47 am John Baldwin wrote:
> On Wednesday, December 25, 2013 10:43:27 am Michael Dexter wrote:
> > 
> > John,
> > 
> > This is awesome.
> > 
> > Can this be triggered from the host to shut down an unresponsive VM?
> > 
> > If so, syntax?
> 
> It cannot.  However, we could add a virtual power button that acts like an
> external ACPI power button.  We could then add a flag to bhyvectl to
> trigger it.  I will look into that.

I've implemented this.  Rather than doing it via bhyvectl, I turned SIGTERM
sent to bhyve itself into the virtual power button.  Thus, if you
'killall bhyve' or 'pkill bhyve' from the host, the guest will gracefully
shut down using S5.  You can still use kill -9 to kill a hung guest (similar
to the 4-second power button override on real metal).

To do this, I fleshed out the ACPI power management support a bit more, adding
more of the implementation of PM1_EVT and PM1_CNT.  I also added an SMI_CMD
register with commands to enable and disable ACPI (for now all that does is
enable/disable the virtual power button as well as toggle SCI_EN in PM1_CNT).
I also added support for raising ACPI's SCI based on settings in PM1_EVT.

I extended mevent to handle EVFILT_SIGNAL events and used a mevent handler for
SIGTERM to simulate pressing the power button.  This just implements the
simple fixed-feature power button support via PM1_EVT rather than having a
control method button (which would require implementing GPE support).

While here, I made sure the SCI interrupt was active-lo / level-triggered,
and I changed PCI interrupts in the MP Table to explicitly use active-lo
since they were already explicitly using level-trigger.

I also fixed the pmtmr code to use pthread_once instead of a racy static
int.  That should probably be a separate commit though.

--- //depot/vendor/freebsd/src/usr.sbin/bhyve/acpi.c
+++ //depot/user/jhb/bhyve/usr.sbin/bhyve/acpi.c
@@ -85,10 +87,6 @@
 #define BHYVE_ASL_SUFFIX	".aml"
 #define BHYVE_ASL_COMPILER	"/usr/sbin/iasl"
 
-#define BHYVE_PM1A_EVT_ADDR	0x400
-#define BHYVE_PM1A_CNT_ADDR	0x404
-#define BHYVE_PM_TIMER_ADDR	0x408
-
 static int basl_keep_temps;
 static int basl_verbose_iasl;
 static int basl_ncpu;
@@ -285,11 +290,11 @@
 	EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n");
 	EFPRINTF(fp, "[0001]\t\tLength : 0A\n");
 	EFPRINTF(fp, "[0001]\t\tBus : 00\n");
-	EFPRINTF(fp, "[0001]\t\tSource : 09\n");
-	EFPRINTF(fp, "[0004]\t\tInterrupt : 00000009\n");
+	EFPRINTF(fp, "[0001]\t\tSource : %02X\n", SCI_INT);
+	EFPRINTF(fp, "[0004]\t\tInterrupt : %08X\n", SCI_INT);
 	EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0000\n");
-	EFPRINTF(fp, "\t\t\tPolarity : 0\n");
-	EFPRINTF(fp, "\t\t\tTrigger Mode : 0\n");
+	EFPRINTF(fp, "\t\t\tPolarity : 3\n");
+	EFPRINTF(fp, "\t\t\tTrigger Mode : 3\n");
 	EFPRINTF(fp, "\n");
 
 	/* Local APIC NMI is connected to LINT 1 on all CPUs */
@@ -336,23 +341,27 @@
 	    basl_acpi_base + FACS_OFFSET);
 	EFPRINTF(fp, "[0004]\t\tDSDT Address : %08X\n",
 	    basl_acpi_base + DSDT_OFFSET);
-	EFPRINTF(fp, "[0001]\t\tModel : 00\n");
+	EFPRINTF(fp, "[0001]\t\tModel : 01\n");
 	EFPRINTF(fp, "[0001]\t\tPM Profile : 00 [Unspecified]\n");
-	EFPRINTF(fp, "[0002]\t\tSCI Interrupt : 0009\n");
-	EFPRINTF(fp, "[0004]\t\tSMI Command Port : 00000000\n");
-	EFPRINTF(fp, "[0001]\t\tACPI Enable Value : 00\n");
-	EFPRINTF(fp, "[0001]\t\tACPI Disable Value : 00\n");
+	EFPRINTF(fp, "[0002]\t\tSCI Interrupt : %04X\n",
+	    SCI_INT);
+	EFPRINTF(fp, "[0004]\t\tSMI Command Port : %08X\n",
+	    SMI_CMD);
+	EFPRINTF(fp, "[0001]\t\tACPI Enable Value : %02X\n",
+	    BHYVE_ACPI_ENABLE);
+	EFPRINTF(fp, "[0001]\t\tACPI Disable Value : %02X\n",
+	    BHYVE_ACPI_DISABLE);
 	EFPRINTF(fp, "[0001]\t\tS4BIOS Command : 00\n");
 	EFPRINTF(fp, "[0001]\t\tP-State Control : 00\n");
 	EFPRINTF(fp, "[0004]\t\tPM1A Event Block Address : %08X\n",
-		 BHYVE_PM1A_EVT_ADDR);
+	    PM1A_EVT_ADDR);
 	EFPRINTF(fp, "[0004]\t\tPM1B Event Block Address : 00000000\n");
 	EFPRINTF(fp, "[0004]\t\tPM1A Control Block Address : %08X\n",
-		 BHYVE_PM1A_CNT_ADDR);
+	    PM1A_CNT_ADDR);
 	EFPRINTF(fp, "[0004]\t\tPM1B Control Block Address : 00000000\n");
 	EFPRINTF(fp, "[0004]\t\tPM2 Control Block Address : 00000000\n");
 	EFPRINTF(fp, "[0004]\t\tPM Timer Block Address : %08X\n",
-		 BHYVE_PM_TIMER_ADDR);
+	    IO_PMTMR);
 	EFPRINTF(fp, "[0004]\t\tGPE0 Block Address : 00000000\n");
 	EFPRINTF(fp, "[0004]\t\tGPE1 Block Address : 00000000\n");
 	EFPRINTF(fp, "[0001]\t\tPM1 Event Block Length : 04\n");
@@ -385,7 +394,7 @@
 	EFPRINTF(fp, "\t\t\tWBINVD flushes all caches (V1) : 0\n");
 	EFPRINTF(fp, "\t\t\tAll CPUs support C1 (V1) : 1\n");
 	EFPRINTF(fp, "\t\t\tC2 works on MP system (V1) : 0\n");
-	EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 1\n");
+	EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 0\n");
 	EFPRINTF(fp, "\t\t\tControl Method Sleep Button (V1) : 1\n");
 	EFPRINTF(fp, "\t\t\tRTC wake not in fixed reg space (V1) : 0\n");
 	EFPRINTF(fp, "\t\t\tRTC can wake system from S4 (V1) : 0\n");
@@ -427,7 +436,7 @@
 	EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
 	EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n",
-	    BHYVE_PM1A_EVT_ADDR);
+	    PM1A_EVT_ADDR);
 	EFPRINTF(fp, "\n");
 	
 	EFPRINTF(fp,
@@ -447,7 +456,7 @@
 	EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
 	EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n",
-	    BHYVE_PM1A_CNT_ADDR);
+	    PM1A_CNT_ADDR);
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp,
@@ -479,7 +488,7 @@
 	EFPRINTF(fp,
 	    "[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n");
 	EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n",
-	    BHYVE_PM_TIMER_ADDR);
+	    IO_PMTMR);
 	EFPRINTF(fp, "\n");
 
 	EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n");
--- //depot/vendor/freebsd/src/usr.sbin/bhyve/acpi.h
+++ //depot/user/jhb/bhyve/usr.sbin/bhyve/acpi.h
@@ -29,6 +29,19 @@
 #ifndef _ACPI_H_
 #define _ACPI_H_
 
+#define	SCI_INT			9
+
+#define	SMI_CMD			0xb2
+#define	BHYVE_ACPI_ENABLE	0xa0
+#define	BHYVE_ACPI_DISABLE	0xa1
+
+#define	PM1A_EVT_ADDR		0x400
+#define	PM1A_CNT_ADDR		0x404
+
+#define	IO_PMTMR		0x408	/* 4-byte i/o port for the timer */
+
+struct vmctx;
+
 int	acpi_build(struct vmctx *ctx, int ncpu);
 
 #endif /* _ACPI_H_ */
--- //depot/vendor/freebsd/src/usr.sbin/bhyve/mevent.c
+++ //depot/user/jhb/bhyve/usr.sbin/bhyve/mevent.c
@@ -135,6 +135,9 @@
 	if (mevp->me_type == EVF_TIMER)
 		retval = EVFILT_TIMER;
 
+	if (mevp->me_type == EVF_SIGNAL)
+		retval = EVFILT_SIGNAL;
+
 	return (retval);
 }
 
@@ -437,7 +440,7 @@
 		 * Block awaiting events
 		 */
 		ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
-		if (ret == -1) {
+		if (ret == -1 && errno != EINTR) {
 			perror("Error return from kevent monitor");
 		}
 		
--- //depot/vendor/freebsd/src/usr.sbin/bhyve/mevent.h
+++ //depot/user/jhb/bhyve/usr.sbin/bhyve/mevent.h
@@ -32,7 +32,8 @@
 enum ev_type {
 	EVF_READ,
 	EVF_WRITE,
-	EVF_TIMER
+	EVF_TIMER,
+	EVF_SIGNAL
 };
 
 struct mevent;
--- //depot/vendor/freebsd/src/usr.sbin/bhyve/mptbl.c
+++ //depot/user/jhb/bhyve/usr.sbin/bhyve/mptbl.c
@@ -36,6 +36,7 @@
 #include <stdio.h>
 #include <string.h>
 
+#include "acpi.h"
 #include "bhyverun.h"
 #include "mptbl.h"
 
@@ -227,13 +228,21 @@
 			mpie->int_type = INTENTRY_TYPE_INT;
 			mpie->src_bus_irq = 0;
 			break;
+		case SCI_INT:
+			/* ACPI SCI is level triggered and active-lo. */
+			mpie->int_flags = INTENTRY_FLAGS_POLARITY_ACTIVELO |
+			    INTENTRY_FLAGS_TRIGGER_LEVEL;
+			mpie->int_type = INTENTRY_TYPE_INT;
+			mpie->src_bus_irq = SCI_INT;
+			break;
 		case 5:
 		case 10:
 		case 11:
 			/*
-			 * PCI Irqs set to level triggered.
+			 * PCI Irqs set to level triggered and active-lo.
 			 */
-			mpie->int_flags = INTENTRY_FLAGS_TRIGGER_LEVEL;
+			mpie->int_flags = INTENTRY_FLAGS_POLARITY_ACTIVELO |
+			    INTENTRY_FLAGS_TRIGGER_LEVEL;
 			mpie->src_bus_id = 0;
 			/* fall through.. */
 		default:
--- //depot/vendor/freebsd/src/usr.sbin/bhyve/pm.c
+++ //depot/user/jhb/bhyve/usr.sbin/bhyve/pm.c
@@ -29,11 +29,20 @@
 __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pm.c 259826 2013-12-24 16:14:19Z jhb $");
 
 #include <sys/types.h>
+#include <machine/vmm.h>
+
+#include <assert.h>
+#include <pthread.h>
+#include <signal.h>
+#include <vmmapi.h>
 
+#include "acpi.h"
 #include "inout.h"
+#include "mevent.h"
 
-#define	PM1A_EVT_ADDR	0x400
-#define	PM1A_CNT_ADDR	0x404
+static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER;
+static struct mevent *power_button;
+static sig_t old_power_handler;
 
 /*
  * Reset Control register at I/O port 0xcf9.  Bit 2 forces a system
@@ -63,12 +73,75 @@
 INOUT_PORT(reset_reg, 0xCF9, IOPORT_F_INOUT, reset_handler);
 
 /*
+ * ACPI's SCI is a level-triggered interrupt.
+ */
+static int sci_active;
+
+static void
+sci_assert(struct vmctx *ctx)
+{
+
+	if (sci_active)
+		return;
+	vm_ioapic_assert_irq(ctx, SCI_INT);
+	sci_active = 1;
+}
+
+static void
+sci_deassert(struct vmctx *ctx)
+{
+
+	if (!sci_active)
+		return;
+	vm_ioapic_deassert_irq(ctx, SCI_INT);
+	sci_active = 0;
+}
+
+/*
  * Power Management 1 Event Registers
  *
- * bhyve doesn't support any power management events currently, so the
- * status register always returns zero.  The enable register preserves
- * its value but has no effect.
+ * The only power management event supported is a power button upon
+ * receiving SIGTERM.
  */
+static uint16_t pm1_enable, pm1_status;
+
+#define	PM1_TMR_STS		0x0001
+#define	PM1_BM_STS		0x0010
+#define	PM1_GBL_STS		0x0020
+#define	PM1_PWRBTN_STS		0x0100
+#define	PM1_SLPBTN_STS		0x0200
+#define	PM1_RTC_STS		0x0400
+#define	PM1_WAK_STS		0x8000
+
+#define	PM1_TMR_EN		0x0001
+#define	PM1_GBL_EN		0x0020
+#define	PM1_PWRBTN_EN		0x0100
+#define	PM1_SLPBTN_EN		0x0200
+#define	PM1_RTC_EN		0x0400
+
+static void
+sci_update(struct vmctx *ctx)
+{
+	int need_sci;
+
+	/* See if the SCI should be active or not. */
+	need_sci = 0;
+	if ((pm1_enable & PM1_TMR_EN) && (pm1_status & PM1_TMR_STS))
+		need_sci = 1;
+	if ((pm1_enable & PM1_GBL_EN) && (pm1_status & PM1_GBL_STS))
+		need_sci = 1;
+	if ((pm1_enable & PM1_PWRBTN_EN) && (pm1_status & PM1_PWRBTN_STS))
+		need_sci = 1;
+	if ((pm1_enable & PM1_SLPBTN_EN) && (pm1_status & PM1_SLPBTN_STS))
+		need_sci = 1;
+	if ((pm1_enable & PM1_RTC_EN) && (pm1_status & PM1_RTC_STS))
+		need_sci = 1;
+	if (need_sci)
+		sci_assert(ctx);
+	else
+		sci_deassert(ctx);
+}
+
 static int
 pm1_status_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
     uint32_t *eax, void *arg)
@@ -76,8 +150,20 @@
 
 	if (bytes != 2)
 		return (-1);
+
+	pthread_mutex_lock(&pm_lock);
 	if (in)
-		*eax = 0;
+		*eax = pm1_status;
+	else {
+		/*
+		 * Writes are only permitted to clear certain bits by
+		 * writing 1 to those flags.
+		 */
+		pm1_status &= ~(*eax & (PM1_WAK_STS | PM1_RTC_STS |
+		    PM1_SLPBTN_STS | PM1_PWRBTN_STS | PM1_BM_STS));
+		sci_update(ctx);
+	}
+	pthread_mutex_unlock(&pm_lock);
 	return (0);
 }
 
@@ -85,25 +171,51 @@
 pm1_enable_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
     uint32_t *eax, void *arg)
 {
-	static uint16_t pm1_enable;
 
 	if (bytes != 2)
 		return (-1);
+
+	pthread_mutex_lock(&pm_lock);
 	if (in)
 		*eax = pm1_enable;
-	else
-		pm1_enable = *eax;
+	else {
+		/*
+		 * Only permit certain bits to be set.  We never use
+		 * the global lock, but ACPI-CA whines profusely if it
+		 * can't set GBL_EN.
+		 */
+		pm1_enable = *eax & (PM1_PWRBTN_EN | PM1_GBL_EN);
+		sci_update(ctx);
+	}
+	pthread_mutex_unlock(&pm_lock);
 	return (0);
 }
 INOUT_PORT(pm1_status, PM1A_EVT_ADDR, IOPORT_F_INOUT, pm1_status_handler);
 INOUT_PORT(pm1_enable, PM1A_EVT_ADDR + 2, IOPORT_F_INOUT, pm1_enable_handler);
 
+static void
+power_button_handler(int signal, enum ev_type type, void *arg)
+{
+	struct vmctx *ctx;
+
+	ctx = arg;
+	pthread_mutex_lock(&pm_lock);
+	if (!(pm1_status & PM1_PWRBTN_STS)) {
+		pm1_status |= PM1_PWRBTN_STS;
+		sci_update(ctx);
+	}
+	pthread_mutex_unlock(&pm_lock);
+}
+
 /*
  * Power Management 1 Control Register
  *
  * This is mostly unimplemented except that we wish to handle writes that
  * set SPL_EN to handle S5 (soft power off).
  */
+static uint16_t pm1_control;
+
+#define	PM1_SCI_EN	0x0001
 #define	PM1_SLP_TYP	0x1c00
 #define	PM1_SLP_EN	0x2000
 #define	PM1_ALWAYS_ZERO	0xc003
@@ -112,7 +224,6 @@
 pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
     uint32_t *eax, void *arg)
 {
-	static uint16_t pm1_control;
 
 	if (bytes != 2)
 		return (-1);
@@ -121,9 +232,11 @@
 	else {
 		/*
 		 * Various bits are write-only or reserved, so force them
-		 * to zero in pm1_control.
+		 * to zero in pm1_control.  Always preserve SCI_EN as OSPM
+		 * can never change it.
 		 */
-		pm1_control = *eax & ~(PM1_SLP_EN | PM1_ALWAYS_ZERO);
+		pm1_control = (pm1_control & PM1_SCI_EN) |
+		    (*eax & ~(PM1_SLP_EN | PM1_ALWAYS_ZERO));
 
 		/*
 		 * If SLP_EN is set, check for S5.  Bhyve's _S5_ method
@@ -137,3 +250,41 @@
 	return (0);
 }
 INOUT_PORT(pm1_control, PM1A_CNT_ADDR, IOPORT_F_INOUT, pm1_control_handler);
+
+/*
+ * ACPI SMI Command Register
+ *
+ * This write-only register is used to enable and disable ACPI.
+ */
+static int
+smi_cmd_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+    uint32_t *eax, void *arg)
+{
+
+	assert(!in);
+	if (bytes != 1)
+		return (-1);
+
+	pthread_mutex_lock(&pm_lock);
+	switch (*eax) {
+	case BHYVE_ACPI_ENABLE:
+		pm1_control |= PM1_SCI_EN;
+		if (power_button == NULL) {
+			power_button = mevent_add(SIGTERM, EVF_SIGNAL,
+			    power_button_handler, ctx);
+			old_power_handler = signal(SIGTERM, SIG_IGN);
+		}
+		break;
+	case BHYVE_ACPI_DISABLE:
+		pm1_control &= ~PM1_SCI_EN;
+		if (power_button != NULL) {
+			mevent_delete(power_button);
+			power_button = NULL;
+			signal(SIGTERM, old_power_handler);
+		}
+		break;
+	}
+	pthread_mutex_unlock(&pm_lock);
+	return (0);
+}
+INOUT_PORT(smi_cmd, SMI_CMD, IOPORT_F_OUT, smi_cmd_handler);
--- //depot/vendor/freebsd/src/usr.sbin/bhyve/pmtmr.c
+++ //depot/user/jhb/bhyve/usr.sbin/bhyve/pmtmr.c
@@ -40,6 +40,7 @@
 #include <assert.h>
 #include <pthread.h>
 
+#include "acpi.h"
 #include "inout.h"
 
 /*
@@ -49,11 +50,10 @@
  * This implementation will be 32-bits
  */
 
-#define	IO_PMTMR	0x408	/* 4-byte i/o port for the timer */
-
 #define PMTMR_FREQ	3579545  /* 3.579545MHz */
 
 static pthread_mutex_t pmtmr_mtx;
+static pthread_once_t pmtmr_once = PTHREAD_ONCE_INIT;
 
 static uint64_t	pmtmr_old;
 
@@ -123,6 +123,7 @@
 		pmtmr_uptime_old = tsnew;
 		pmtmr_old = timespec_to_pmtmr(&tsnew, &tsold);
 	}
+	pthread_mutex_init(&pmtmr_mtx, NULL);
 }
 
 static uint32_t
@@ -133,13 +134,7 @@
 	uint64_t	pmtmr_new;
 	int		error;
 
-	static int	inited = 0;
-
-	if (!inited) {
-		pthread_mutex_init(&pmtmr_mtx, NULL);
-		pmtmr_init();
-		inited = 1;
-	}
+	pthread_once(&pmtmr_once, pmtmr_init);
 
 	pthread_mutex_lock(&pmtmr_mtx);
 

> 
> -- 
> John Baldwin
> _______________________________________________
> freebsd-virtualization at freebsd.org mailing list
> http://lists.freebsd.org/mailman/listinfo/freebsd-virtualization
> To unsubscribe, send any mail to "freebsd-virtualization-unsubscribe at freebsd.org"
> 

-- 
John Baldwin


More information about the freebsd-virtualization mailing list