[PATCH] Untangle TPR shadowing and APIC virtualization / Make Win guests on Bhyve _fast_

Yamagi lists at yamagi.org
Sat Dec 21 19:26:05 UTC 2019


Hi,
a long known problem with Bhyve is that Windows guests are rather slow.
With Windows 10 1903 this became much worse, to the point that the
guest is unusable. I have found the reason for this: Windows hammers on
the %cr8 control register. For example, Windows 10 1909 on an i7-2620M
has about 68,000 %cr8 accesses per second. Each of them triggers a vm
exit.

The most common solution is TPR shadowing. Many thanks to royger in
#bhyve for getting me on the right track. Bhyve already implements TPR
shadowing. On AMD SVM it just works, but the implementation for Intel
VT-x is bound to APIC virtualization. And APIC virtualization is a Xeon
feature that is missing on most (all?) desktop CPUs.

The patch - further down inline or under [0] - separates TPR shadowing
from APIC virtualization, so TPR shadowing can be used on desktop CPUs
as well. The patch doesn't just give a small speed boost, it's a
difference like day and night. As an example, without the patch, the
installation of Windows 10 1909 takes about 2280 seconds from start to
first reboot. With the patch, only 370 seconds. On an old Thinkpad
X220, Windows 10 guests were previously unusable, now they are reasonably
fast.

The patch does:

* Add a new tunable 'hw.vmm.vmx.use_tpr_shadowing' to disable TPR
  shadowing. Also add 'hw.vmm.vmx.cap.tpr_shadowing' to be able to query
  if TPR shadowing is used.

* Detach the initialization of TPR shadowing from the initialization of
  APIC virtualization. APIC virtualization still needs TPR shadowing,
  but not vice versa. Any CPU that supports APIC virtualization should
  also support TPR shadowing.

* When TPR shadowing is used, the APIC page of each vCPU is written to
  the VMCS_VIRTUAL_APIC field of the VMCS so that the CPU can write
  directly to the page without intercept.

* On vm exit, vlapic_update_ppr() is called to update the PPR.

The patch was tested on an i7-2620M, an i7-6700k and a Xeon Silver
4110. Both Windows and FreeBSD guests work correctly.

Regards,
Yamagi

0: https://gist.github.com/Yamagi/de70c08eadeeef14eec4cb42aeb5957f

----

diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 605fd0bda766..324a1e9d0c3c 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -172,6 +172,10 @@ static int cap_invpcid;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
     0, "Guests are allowed to use INVPCID");
 
+static int tpr_shadowing;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, tpr_shadowing, CTLFLAG_RD,
+    &tpr_shadowing, 0, "TPR shadowin support");
+
 static int virtual_interrupt_delivery;
 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
@@ -627,7 +631,7 @@ vmx_restore(void)
 static int
 vmx_init(int ipinum)
 {
-	int error, use_tpr_shadow;
+	int error;
 	uint64_t basic, fixed0, fixed1, feature_control;
 	uint32_t tmp, procbased2_vid_bits;
 
@@ -750,6 +754,24 @@ vmx_init(int ipinum)
 	    MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
 	    &tmp) == 0);
 
+	/*
+	 * Check support for TPR shadow.
+	 */
+	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
+	    &tmp);
+	if (error == 0) {
+		tpr_shadowing = 1;
+		TUNABLE_INT_FETCH("hw.vmm.vmx.use_tpr_shadowing",
+		    &tpr_shadowing);
+	}
+
+	if (tpr_shadowing) {
+		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
+		procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
+		procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
+	}
+
 	/*
 	 * Check support for virtual interrupt delivery.
 	 */
@@ -758,13 +780,9 @@ vmx_init(int ipinum)
 	    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
 	    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
 
-	use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
-	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
-	    &tmp) == 0);
-
 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
 	    procbased2_vid_bits, 0, &tmp);
-	if (error == 0 && use_tpr_shadow) {
+	if (error == 0 && tpr_shadowing) {
 		virtual_interrupt_delivery = 1;
 		TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
 		    &virtual_interrupt_delivery);
@@ -775,13 +793,6 @@ vmx_init(int ipinum)
 		procbased_ctls2 |= procbased2_vid_bits;
 		procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
 
-		/*
-		 * No need to emulate accesses to %CR8 if virtual
-		 * interrupt delivery is enabled.
-		 */
-		procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
-		procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
-
 		/*
 		 * Check for Posted Interrupts only if Virtual Interrupt
 		 * Delivery is enabled.
@@ -1051,10 +1062,13 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
 		vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1;
 		error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1);
 
-		if (virtual_interrupt_delivery) {
-			error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
+		if (tpr_shadowing) {
 			error += vmwrite(VMCS_VIRTUAL_APIC,
 			    vtophys(&vmx->apic_page[i]));
+		}
+
+		if (virtual_interrupt_delivery) {
+			error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
 			error += vmwrite(VMCS_EOI_EXIT0, 0);
 			error += vmwrite(VMCS_EOI_EXIT1, 0);
 			error += vmwrite(VMCS_EOI_EXIT2, 0);
@@ -2313,6 +2327,14 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 		}
 	}
 
+	/*
+	 * If 'TPR shadowing' is used, update the local APICs PPR.
+	 */
+	if (tpr_shadowing) {
+		vlapic = vm_lapic(vmx->vm, vcpu);
+		vlapic_update_ppr(vlapic);
+	}
+
 	switch (reason) {
 	case EXIT_REASON_TASK_SWITCH:
 		ts = &vmexit->u.task_switch;
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index 74e6cd967396..289fdb7e077d 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -490,7 +490,7 @@ dump_isrvec_stk(struct vlapic *vlapic)
  * Algorithm adopted from section "Interrupt, Task and Processor Priority"
  * in Intel Architecture Manual Vol 3a.
  */
-static void
+void
 vlapic_update_ppr(struct vlapic *vlapic)
 {
 	int isrvec, tpr, ppr;
diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h
index 2a5f54003253..71b97feab6bc 100644
--- a/sys/amd64/vmm/io/vlapic.h
+++ b/sys/amd64/vmm/io/vlapic.h
@@ -74,6 +74,8 @@ void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum);
 void vlapic_fire_cmci(struct vlapic *vlapic);
 int vlapic_trigger_lvt(struct vlapic *vlapic, int vector);
 
+void vlapic_update_ppr(struct vlapic *vlapic);
+
 uint64_t vlapic_get_apicbase(struct vlapic *vlapic);
 int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val);
 void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s);

-- 
Homepage: https://www.yamagi.org
Github:   https://github.com/yamagi
GPG:      0x1D502515
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 833 bytes
Desc: not available
URL: <http://lists.freebsd.org/pipermail/freebsd-virtualization/attachments/20191221/86499805/attachment.sig>


More information about the freebsd-virtualization mailing list