git: 3e6e4e4a0d42 - main - hwpstate: add CPPC support for pstate driver on AMD

From: ShengYi Hung <aokblast_at_FreeBSD.org>
Date: Sat, 03 Jan 2026 02:53:16 UTC
The branch main has been updated by aokblast:

URL: https://cgit.FreeBSD.org/src/commit/?id=3e6e4e4a0d42fa24f3b2a1c087e9ad25f9594081

commit 3e6e4e4a0d42fa24f3b2a1c087e9ad25f9594081
Author:     ShengYi Hung <aokblast@FreeBSD.org>
AuthorDate: 2025-07-08 11:45:26 +0000
Commit:     ShengYi Hung <aokblast@FreeBSD.org>
CommitDate: 2026-01-03 02:52:51 +0000

    hwpstate: add CPPC support for pstate driver on AMD
    
    Implement CPPC interface for AMD Pstate Driver.
    This feature is only enabled when CPUID reports CPPC support.
    
    CPPC is implemented by the following steps:
    
    1. Write MSR to enable it.
    2. Read the capability register, which reports the performance levels
    for lowest, most energy-efficient, guaranteed, and maximum performance.
    3. Write the request register with EPP set to the energy-balanced mode,
    letting the CPU and firmware enter autonomous mode.
    
    Also, create a sysctl handler to allow userspace to change the EPP value.
    In Intel's hwpstate, the EPP value can be set at package level or at core
    level.  AMD, however, only provides core-level control.  Thus, to stay
    consistent with the Intel code, we implement package-level control in
    software and provide another sysctl (machdep.hwpstate_pkg_ctrl) to
    control it.
    
    Reviewed by:    olce, khng
    Approved by:    lwhsu (mentor)
    MFC after:      2 weeks
    Sponsored by:   The FreeBSD Foundation
    Differential Revision: https://reviews.freebsd.org/D49587
---
 sys/x86/cpufreq/hwpstate_amd.c | 361 ++++++++++++++++++++++++++++++++++++++---
 sys/x86/include/specialreg.h   |   1 +
 2 files changed, 337 insertions(+), 25 deletions(-)

diff --git a/sys/x86/cpufreq/hwpstate_amd.c b/sys/x86/cpufreq/hwpstate_amd.c
index fc948dc90a15..4395e43a219f 100644
--- a/sys/x86/cpufreq/hwpstate_amd.c
+++ b/sys/x86/cpufreq/hwpstate_amd.c
@@ -8,6 +8,7 @@
  * Copyright (c) 2009 Michael Reifenberger
  * Copyright (c) 2009 Norikatsu Shigemura
  * Copyright (c) 2008-2009 Gen Otsuji
+ * Copyright (c) 2025 ShengYi Hung
  *
  * This code is depending on kern_cpu.c, est.c, powernow.c, p4tcc.c, smist.c
  * in various parts. The authors of these files are Nate Lawson,
@@ -55,6 +56,7 @@
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/pcpu.h>
+#include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/sched.h>
 
@@ -74,6 +76,15 @@
 #define	MSR_AMD_10H_11H_STATUS	0xc0010063
 #define	MSR_AMD_10H_11H_CONFIG	0xc0010064
 
+#define	MSR_AMD_CPPC_CAPS_1	0xc00102b0
+#define	MSR_AMD_CPPC_ENABLE	0xc00102b1
+#define	MSR_AMD_CPPC_CAPS_2	0xc00102b2
+#define	MSR_AMD_CPPC_REQUEST	0xc00102b3
+#define	MSR_AMD_CPPC_STATUS	0xc00102b4
+
+#define	MSR_AMD_PWR_ACC		0xc001007a
+#define	MSR_AMD_PWR_ACC_MX	0xc001007b
+
 #define	AMD_10H_11H_MAX_STATES	16
 
 /* for MSR_AMD_10H_11H_LIMIT C001_0061 */
@@ -92,6 +103,23 @@
 
 #define	AMD_1AH_CUR_FID(msr)			((msr) & 0xFFF)
 
+#define	AMD_CPPC_CAPS_1_HIGH_PERF_BITS		0xff000000
+#define	AMD_CPPC_CAPS_1_NOMINAL_PERF_BITS	0x00ff0000
+#define	AMD_CPPC_CAPS_1_LOW_NONLIN_PERF_BITS	0x0000ff00
+#define	AMD_CPPC_CAPS_1_LOW_PERF_BITS		0x000000ff
+
+#define	AMD_CPPC_REQUEST_ENERGY_PERF_BITS	0xff000000
+#define	AMD_CPPC_REQUEST_DES_PERF_BITS		0x00ff0000
+#define	AMD_CPPC_REQUEST_MIN_PERF_BITS		0x0000ff00
+#define	AMD_CPPC_REQUEST_MAX_PERF_BITS		0x000000ff
+
+#define	HWP_AMD_CLASSNAME			"hwpstate_amd"
+
+/*
+ * Bit-field helpers.  'bits' must be a non-zero mask of contiguous bits; the
+ * field's shift is derived from the mask's lowest set bit with ffsll().
+ *
+ * BITS_VALUE(bits, num)	   extract the field selected by 'bits' from 'num'.
+ * BITS_WITH_VALUE(bits, val)	   'val' shifted into the field's position.
+ * SET_BITS_VALUE(var, bits, val)  replace the field in 'var' with 'val'.
+ *
+ * SET_BITS_VALUE widens the mask to uintmax_t before complementing it.  A
+ * 32-bit unsigned mask such as 0xff000000 would otherwise be complemented in
+ * 32 bits and zero-extended, clearing bits 63:32 of a 64-bit 'var'.  The new
+ * value is also masked so that an oversized 'val' cannot spill into the
+ * neighbouring fields.
+ */
+#define	BITS_VALUE(bits, num)			(((num) & (bits)) >> (ffsll((bits)) - 1))
+#define	BITS_WITH_VALUE(bits, val)		((uintmax_t)(val) << (ffsll((bits)) - 1))
+#define	SET_BITS_VALUE(var, bits, val) \
+	((var) = ((var) & ~(uintmax_t)(bits)) | \
+	    (BITS_WITH_VALUE((bits), (val)) & (uintmax_t)(bits)))
+
 #define	HWPSTATE_DEBUG(dev, msg...)			\
 	do {						\
 		if (hwpstate_verbose)			\
@@ -106,10 +134,16 @@ struct hwpstate_setting {
 	int	pstate_id;	/* P-State id */
 };
 
+enum hwpstate_flags {
+	PSTATE_CPPC = 1,
+};
+
 struct hwpstate_softc {
 	device_t		dev;
-	struct hwpstate_setting	hwpstate_settings[AMD_10H_11H_MAX_STATES];
+	struct hwpstate_setting hwpstate_settings[AMD_10H_11H_MAX_STATES];
 	int			cfnum;
+	uint32_t flags;
+	uint64_t req;
 };
 
 static void	hwpstate_identify(driver_t *driver, device_t parent);
@@ -140,6 +174,11 @@ SYSCTL_BOOL(_debug, OID_AUTO, hwpstate_pstate_limit, CTLFLAG_RWTUN,
     "If enabled (1), limit administrative control of P-states to the value in "
     "CurPstateLimit");
 
+static bool hwpstate_pkg_ctrl_enable = true;
+SYSCTL_BOOL(_machdep, OID_AUTO, hwpstate_pkg_ctrl, CTLFLAG_RDTUN,
+    &hwpstate_pkg_ctrl_enable, 0,
+    "Set 1 (default) to enable package-level control, 0 to disable");
+
 static device_method_t hwpstate_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_identify,	hwpstate_identify),
@@ -159,8 +198,154 @@ static device_method_t hwpstate_methods[] = {
 	{0, 0}
 };
 
+/*
+ * Sysctl handler producing a human-readable dump of one CPU's CPPC state.
+ *
+ * arg1 is the driver softc.  The calling thread is bound to the CPU behind
+ * the device while the CPPC enable, capability and request MSRs are read,
+ * and is unbound again before the text is copied out.
+ *
+ * Returns ENXIO if the device has no pcpu, the non-zero rdmsr_safe() result
+ * if an MSR cannot be read, or the result of sbuf_finish()/SYSCTL_OUT().
+ */
+static int
+amdhwp_dump_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+	device_t dev;
+	struct pcpu *pc;
+	struct sbuf *sb;
+	struct hwpstate_softc *sc;
+	uint64_t data;
+	int ret;
+
+	sc = (struct hwpstate_softc *)arg1;
+	dev = sc->dev;
+
+	pc = cpu_get_pcpu(dev);
+	if (pc == NULL)
+		return (ENXIO);
+
+	sb = sbuf_new(NULL, NULL, 1024, SBUF_FIXEDLEN | SBUF_INCLUDENUL);
+	sbuf_putc(sb, '\n');
+	thread_lock(curthread);
+	sched_bind(curthread, pc->pc_cpuid);
+	thread_unlock(curthread);
+
+	/* Never format 'data' unless the read that fills it succeeded. */
+	ret = rdmsr_safe(MSR_AMD_CPPC_ENABLE, &data);
+	if (ret != 0)
+		goto out;
+	sbuf_printf(sb, "CPU%d: HWP %sabled\n", pc->pc_cpuid,
+	    ((data & 1) ? "En" : "Dis"));
+
+	/* Test the same bit that was just reported; nothing more if disabled. */
+	if ((data & 1) == 0)
+		goto out;
+
+	ret = rdmsr_safe(MSR_AMD_CPPC_CAPS_1, &data);
+	if (ret != 0)
+		goto out;
+	sbuf_printf(sb, "\tHighest Performance: %03ju\n",
+	    BITS_VALUE(AMD_CPPC_CAPS_1_HIGH_PERF_BITS, data));
+	sbuf_printf(sb, "\tGuaranteed Performance: %03ju\n",
+	    BITS_VALUE(AMD_CPPC_CAPS_1_NOMINAL_PERF_BITS, data));
+	sbuf_printf(sb, "\tEfficient Performance: %03ju\n",
+	    BITS_VALUE(AMD_CPPC_CAPS_1_LOW_NONLIN_PERF_BITS, data));
+	sbuf_printf(sb, "\tLowest Performance: %03ju\n",
+	    BITS_VALUE(AMD_CPPC_CAPS_1_LOW_PERF_BITS, data));
+	sbuf_putc(sb, '\n');
+
+	ret = rdmsr_safe(MSR_AMD_CPPC_REQUEST, &data);
+	if (ret != 0)
+		goto out;
+
+	/*
+	 * Decode with the AMD_CPPC_REQUEST_* masks rather than hand-written
+	 * shifts, so each label is paired with the field the rest of the
+	 * driver programs (maximum in the low byte, minimum in the next one).
+	 */
+#define pkg_print(name, bits)                                   \
+	do {                                                    \
+		sbuf_printf(sb, "\t%s: %03u\n", name,           \
+		    (unsigned)BITS_VALUE(bits, data));          \
+	} while (0)
+
+	pkg_print("Requested Efficiency Performance Preference",
+	    AMD_CPPC_REQUEST_ENERGY_PERF_BITS);
+	pkg_print("Requested Desired Performance",
+	    AMD_CPPC_REQUEST_DES_PERF_BITS);
+	pkg_print("Requested Maximum Performance",
+	    AMD_CPPC_REQUEST_MAX_PERF_BITS);
+	pkg_print("Requested Minimum Performance",
+	    AMD_CPPC_REQUEST_MIN_PERF_BITS);
+#undef pkg_print
+
+	sbuf_putc(sb, '\n');
+
+out:
+	thread_lock(curthread);
+	sched_unbind(curthread);
+	thread_unlock(curthread);
+
+	/* Only publish the buffer when every MSR read succeeded. */
+	if (ret == 0)
+		ret = sbuf_finish(sb);
+	if (ret == 0)
+		ret = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
+	sbuf_delete(sb);
+
+	return (ret);
+}
+
+/*
+ * Program the raw EPP value 'val' into the CPPC request MSR of the CPU that
+ * backs 'hwp_device'.
+ *
+ * The calling thread is bound to that CPU for the duration of the write.
+ * The cached request value in the softc is only updated once the MSR write
+ * has succeeded, so the cache keeps describing what was last programmed.
+ * Nothing is written when the cached EPP field already equals 'val'.
+ *
+ * Returns true on success (including the no-change case), false if the
+ * device or its pcpu is missing or the MSR write fails.
+ */
+static bool
+sysctl_epp_select_per_core(const device_t hwp_device, uint32_t val)
+{
+	struct hwpstate_softc *sc;
+	struct pcpu *pc;
+	uint64_t req;
+	bool success = true;
+	int ret;
+
+	if (hwp_device == NULL)
+		return (false);
+	pc = cpu_get_pcpu(hwp_device);
+	if (pc == NULL)
+		return (false);
+
+	thread_lock(curthread);
+	sched_bind(curthread, pc->pc_cpuid);
+	thread_unlock(curthread);
+	sc = device_get_softc(hwp_device);
+	if (BITS_VALUE(AMD_CPPC_REQUEST_ENERGY_PERF_BITS, sc->req) == val)
+		goto end;
+
+	/* Build the new request in a local; commit it only after the write. */
+	req = sc->req;
+	SET_BITS_VALUE(req, AMD_CPPC_REQUEST_ENERGY_PERF_BITS, val);
+	ret = wrmsr_safe(MSR_AMD_CPPC_REQUEST, req);
+	if (ret != 0) {
+		success = false;
+		device_printf(hwp_device, "Failed to set EPP to %u\n", val);
+		goto end;
+	}
+	sc->req = req;
+
+end:
+	thread_lock(curthread);
+	sched_unbind(curthread);
+	thread_unlock(curthread);
+
+	return (success);
+}
+
+/*
+ * Sysctl handler for the per-device "epp" node.
+ *
+ * Reads report the cached EPP field of the request value as a percentage
+ * (0..100) of the field's maximum.  Writes accept a percentage in the same
+ * range, scale it back to the raw field range and apply it either to every
+ * CPU's device in the HWP_AMD_CLASSNAME devclass (when
+ * hwpstate_pkg_ctrl_enable is set) or to this device only.
+ *
+ * Returns ENODEV when the device is not in CPPC mode, EINVAL for a value
+ * above 100, the error from sysctl_handle_int(), or EIO when the value
+ * could not be applied to at least one targeted CPU.
+ */
+static int
+sysctl_epp_select(SYSCTL_HANDLER_ARGS)
+{
+	device_t dev, hwp_dev;
+	struct hwpstate_softc *sc;
+	const uint32_t max_energy_perf =
+	    BITS_VALUE(AMD_CPPC_REQUEST_ENERGY_PERF_BITS, (uint64_t)-1);
+	devclass_t dc;
+	uint32_t val;
+	int ret = 0;
+	int cpu;
+
+	dev = oidp->oid_arg1;
+	sc = device_get_softc(dev);
+
+	if (!(sc->flags & PSTATE_CPPC))
+		return (ENODEV);
+
+	val = BITS_VALUE(AMD_CPPC_REQUEST_ENERGY_PERF_BITS, sc->req) * 100 /
+	    max_energy_perf;
+	ret = sysctl_handle_int(oidp, &val, 0, req);
+	if (ret != 0 || req->newptr == NULL)
+		goto end;
+	if (val > 100) {
+		ret = EINVAL;
+		goto end;
+	}
+	val = (val * max_energy_perf) / 100;
+
+	if (hwpstate_pkg_ctrl_enable) {
+		dc = devclass_find(HWP_AMD_CLASSNAME);
+		KASSERT(dc != NULL,
+		    (HWP_AMD_CLASSNAME ": devclass cannot be null"));
+		CPU_FOREACH(cpu) {
+			hwp_dev = devclass_get_device(dc, cpu);
+			/* Skip CPUs that have no attached instance. */
+			if (hwp_dev == NULL)
+				continue;
+			/*
+			 * Keep going so the remaining CPUs still receive the
+			 * new value, but report the failure to the caller.
+			 */
+			if (!sysctl_epp_select_per_core(hwp_dev, val))
+				ret = EIO;
+		}
+	} else if (!sysctl_epp_select_per_core(dev, val))
+		ret = EIO;
+
+end:
+	return (ret);
+}
+
 static driver_t hwpstate_driver = {
-	"hwpstate",
+	HWP_AMD_CLASSNAME,
 	hwpstate_methods,
 	sizeof(struct hwpstate_softc),
 };
@@ -269,6 +454,8 @@ hwpstate_set(device_t dev, const struct cf_setting *cf)
 	if (cf == NULL)
 		return (EINVAL);
 	sc = device_get_softc(dev);
+	if (sc->flags & PSTATE_CPPC)
+		return (EOPNOTSUPP);
 	set = sc->hwpstate_settings;
 	for (i = 0; i < sc->cfnum; i++)
 		if (CPUFREQ_CMP(cf->freq, set[i].freq))
@@ -284,21 +471,38 @@ hwpstate_get(device_t dev, struct cf_setting *cf)
 {
 	struct hwpstate_softc *sc;
 	struct hwpstate_setting set;
+	struct pcpu *pc;
 	uint64_t msr;
+	uint64_t rate;
+	int ret;
 
 	sc = device_get_softc(dev);
 	if (cf == NULL)
 		return (EINVAL);
-	msr = rdmsr(MSR_AMD_10H_11H_STATUS);
-	if (msr >= sc->cfnum)
-		return (EINVAL);
-	set = sc->hwpstate_settings[msr];
 
-	cf->freq = set.freq;
-	cf->volts = set.volts;
-	cf->power = set.power;
-	cf->lat = set.lat;
-	cf->dev = dev;
+	if (sc->flags & PSTATE_CPPC) {
+		pc = cpu_get_pcpu(dev);
+		if (pc == NULL)
+			return (ENXIO);
+
+		memset(cf, CPUFREQ_VAL_UNKNOWN, sizeof(*cf));
+		cf->dev = dev;
+		if ((ret = cpu_est_clockrate(pc->pc_cpuid, &rate)))
+			return (ret);
+		cf->freq = rate / 1000000;
+	} else {
+		msr = rdmsr(MSR_AMD_10H_11H_STATUS);
+		if (msr >= sc->cfnum)
+			return (EINVAL);
+		set = sc->hwpstate_settings[msr];
+
+		cf->freq = set.freq;
+		cf->volts = set.volts;
+		cf->power = set.power;
+		cf->lat = set.lat;
+		cf->dev = dev;
+	}
+
 	return (0);
 }
 
@@ -312,6 +516,9 @@ hwpstate_settings(device_t dev, struct cf_setting *sets, int *count)
 	if (sets == NULL || count == NULL)
 		return (EINVAL);
 	sc = device_get_softc(dev);
+	if (sc->flags & PSTATE_CPPC)
+		return (EOPNOTSUPP);
+
 	if (*count < sc->cfnum)
 		return (E2BIG);
 	for (i = 0; i < sc->cfnum; i++, sets++) {
@@ -330,19 +537,24 @@ hwpstate_settings(device_t dev, struct cf_setting *sets, int *count)
 static int
 hwpstate_type(device_t dev, int *type)
 {
+	struct hwpstate_softc *sc;
 
 	if (type == NULL)
 		return (EINVAL);
+	sc = device_get_softc(dev);
 
 	*type = CPUFREQ_TYPE_ABSOLUTE;
+	*type |= sc->flags & PSTATE_CPPC ?
+	    CPUFREQ_FLAG_INFO_ONLY | CPUFREQ_FLAG_UNCACHED :
+	    0;
 	return (0);
 }
 
 static void
 hwpstate_identify(driver_t *driver, device_t parent)
 {
-
-	if (device_find_child(parent, "hwpstate", DEVICE_UNIT_ANY) != NULL)
+	if (device_find_child(parent, HWP_AMD_CLASSNAME, DEVICE_UNIT_ANY) !=
+	    NULL)
 		return;
 
 	if ((cpu_vendor_id != CPU_VENDOR_AMD || CPUID_TO_FAMILY(cpu_id) < 0x10) &&
@@ -357,14 +569,82 @@ hwpstate_identify(driver_t *driver, device_t parent)
 		return;
 	}
 
-	if (resource_disabled("hwpstate", 0))
+	if (resource_disabled(HWP_AMD_CLASSNAME, 0))
 		return;
 
-	if (BUS_ADD_CHILD(parent, 10, "hwpstate", device_get_unit(parent))
-	    == NULL)
+	if (BUS_ADD_CHILD(parent, 10, HWP_AMD_CLASSNAME,
+		device_get_unit(parent)) == NULL)
 		device_printf(parent, "hwpstate: add child failed\n");
 }
 
+/*
+ * Enable CPPC on the CPU behind sc->dev and program its request MSR.
+ *
+ * With the calling thread bound to that CPU, this writes the CPPC enable
+ * MSR, reads the current request and capability MSRs, and then writes a
+ * request built from: EPP 0x80, minimum/maximum performance taken from the
+ * capability register's lowest/highest fields, and desired performance 0.
+ * The request value is kept in sc->req.
+ *
+ * Returns 0 on success, ENXIO if the device has no pcpu, or the non-zero
+ * result of the failing MSR access.
+ */
+static int
+amd_set_autonomous_hwp(struct hwpstate_softc *sc)
+{
+	device_t cpudev;
+	struct pcpu *pcpu;
+	uint64_t capabilities, lowest, highest;
+	int error;
+
+	cpudev = sc->dev;
+	pcpu = cpu_get_pcpu(cpudev);
+	if (pcpu == NULL)
+		return (ENXIO);
+
+	thread_lock(curthread);
+	sched_bind(curthread, pcpu->pc_cpuid);
+	thread_unlock(curthread);
+
+	error = wrmsr_safe(MSR_AMD_CPPC_ENABLE, 1);
+	if (error != 0) {
+		device_printf(cpudev, "Failed to enable cppc for cpu%d (%d)\n",
+		    pcpu->pc_cpuid, error);
+		goto unbind;
+	}
+
+	error = rdmsr_safe(MSR_AMD_CPPC_REQUEST, &sc->req);
+	if (error != 0) {
+		device_printf(cpudev,
+		    "Failed to read CPPC request MSR for cpu%d (%d)\n",
+		    pcpu->pc_cpuid, error);
+		goto unbind;
+	}
+
+	error = rdmsr_safe(MSR_AMD_CPPC_CAPS_1, &capabilities);
+	if (error != 0) {
+		device_printf(cpudev,
+		    "Failed to read HWP capabilities MSR for cpu%d (%d)\n",
+		    pcpu->pc_cpuid, error);
+		goto unbind;
+	}
+
+	lowest = BITS_VALUE(AMD_CPPC_CAPS_1_LOW_PERF_BITS, capabilities);
+	highest = BITS_VALUE(AMD_CPPC_CAPS_1_HIGH_PERF_BITS, capabilities);
+
+	/*
+	 * Intel's reference manual gives 0x80 (balanced mode) as the default
+	 * EPP; the same value is used here for consistency.
+	 */
+	SET_BITS_VALUE(sc->req, AMD_CPPC_REQUEST_ENERGY_PERF_BITS, 0x80);
+	/* A desired performance of 0 selects autonomous mode. */
+	SET_BITS_VALUE(sc->req, AMD_CPPC_REQUEST_DES_PERF_BITS, 0);
+	SET_BITS_VALUE(sc->req, AMD_CPPC_REQUEST_MIN_PERF_BITS, lowest);
+	SET_BITS_VALUE(sc->req, AMD_CPPC_REQUEST_MAX_PERF_BITS, highest);
+
+	error = wrmsr_safe(MSR_AMD_CPPC_REQUEST, sc->req);
+	if (error != 0)
+		device_printf(cpudev,
+		    "Failed to setup autonomous HWP for cpu%d\n",
+		    pcpu->pc_cpuid);
+
+unbind:
+	thread_lock(curthread);
+	sched_unbind(curthread);
+	thread_unlock(curthread);
+
+	return (error);
+}
+
 static int
 hwpstate_probe(device_t dev)
 {
@@ -373,15 +653,25 @@ hwpstate_probe(device_t dev)
 	uint64_t msr;
 	int error, type;
 
-	/*
-	 * Only hwpstate0.
-	 * It goes well with acpi_throttle.
-	 */
-	if (device_get_unit(dev) != 0)
-		return (ENXIO);
-
 	sc = device_get_softc(dev);
+
+	if (amd_extended_feature_extensions & AMDFEID_CPPC) {
+		sc->flags |= PSTATE_CPPC;
+		device_set_desc(dev,
+		    "AMD Collaborative Processor Performance Control (CPPC)");
+	} else {
+		/*
+		 * No CPPC support.  Only keep hwpstate0, it goes well with
+		 * acpi_throttle.
+		 */
+		if (device_get_unit(dev) != 0)
+			return (ENXIO);
+		device_set_desc(dev, "Cool`n'Quiet 2.0");
+	}
+
 	sc->dev = dev;
+	if (sc->flags & PSTATE_CPPC)
+		return (0);
 
 	/*
 	 * Check if acpi_perf has INFO only flag.
@@ -433,14 +723,32 @@ hwpstate_probe(device_t dev)
 	if (error)
 		return (error);
 
-	device_set_desc(dev, "Cool`n'Quiet 2.0");
 	return (0);
 }
 
 static int
 hwpstate_attach(device_t dev)
 {
+	struct hwpstate_softc *sc;
+	int res;
 
+	sc = device_get_softc(dev);
+	if (sc->flags & PSTATE_CPPC) {
+		if ((res = amd_set_autonomous_hwp(sc)))
+			return res;
+		SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
+		    SYSCTL_STATIC_CHILDREN(_debug), OID_AUTO,
+		    device_get_nameunit(dev),
+		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE,
+		    sc, 0, amdhwp_dump_sysctl_handler, "A", "");
+
+		SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
+		    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
+		    "epp", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, dev, 0,
+		    sysctl_epp_select, "I",
+		    "Efficiency/Performance Preference "
+		    "(range from 0, most performant, through 100, most efficient)");
+	}
 	return (cpufreq_register(dev));
 }
 
@@ -584,8 +892,11 @@ out:
 static int
 hwpstate_detach(device_t dev)
 {
+	struct hwpstate_softc *sc;
 
-	hwpstate_goto_pstate(dev, 0);
+	sc = device_get_softc(dev);
+	if (!(sc->flags & PSTATE_CPPC))
+		hwpstate_goto_pstate(dev, 0);
 	return (cpufreq_unregister(dev));
 }
 
diff --git a/sys/x86/include/specialreg.h b/sys/x86/include/specialreg.h
index e9dde5c3b46a..3e5f598cd82a 100644
--- a/sys/x86/include/specialreg.h
+++ b/sys/x86/include/specialreg.h
@@ -418,6 +418,7 @@
 #define	AMDPM_HW_PSTATE		0x00000080
 #define	AMDPM_TSC_INVARIANT	0x00000100
 #define	AMDPM_CPB		0x00000200
+#define	AMDPM_PWR_REPORT	0x00001000
 
 /*
  * AMD extended function 8000_0008h ebx info (amd_extended_feature_extensions)