git: 16e5abf415ba - main - APEI: Provide more info on fatal hardware errors
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Sat, 06 Jun 2026 00:16:43 UTC
The branch main has been updated by gallatin:
URL: https://cgit.FreeBSD.org/src/commit/?id=16e5abf415baf801c6d7c7948a742aeda75e2237
commit 16e5abf415baf801c6d7c7948a742aeda75e2237
Author: Andrew Gallatin <gallatin@FreeBSD.org>
AuthorDate: 2026-06-06 00:07:03 +0000
Commit: Andrew Gallatin <gallatin@FreeBSD.org>
CommitDate: 2026-06-06 00:12:21 +0000
APEI: Provide more info on fatal hardware errors
This change refactors fatal error delivery via APEI and prints more info:
- Makes the NMI handler call into the ge handler to establish a common
code flow, no matter how the error is delivered
- Adds the FRU text to the panic string, so that it provides more
information than just "APEI Fatal Hardware Error!"; for example:
"APEI Fatal Hardware Error: PcieError"
- Prints more details about fatal PCIe errors. Note that we skip acquiring
Giant on fatal errors.
- Hexdumps the full GED data on fatal errors, so as to facilitate
offline data analysis
Reviewed by: imp
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D57417
---
sys/dev/acpica/acpi_apei.c | 53 ++++++++++++++++++++++++++++++++--------------
1 file changed, 37 insertions(+), 16 deletions(-)
diff --git a/sys/dev/acpica/acpi_apei.c b/sys/dev/acpica/acpi_apei.c
index e85b3910e46d..925558d585bf 100644
--- a/sys/dev/acpica/acpi_apei.c
+++ b/sys/dev/acpica/acpi_apei.c
@@ -237,7 +237,7 @@ apei_mem_handler(ACPI_HEST_GENERIC_DATA *ged)
}
static int
-apei_pcie_handler(ACPI_HEST_GENERIC_DATA *ged)
+apei_pcie_handler(ACPI_HEST_GENERIC_DATA *ged, bool fatal)
{
struct apei_pcie_error *p = (struct apei_pcie_error *)GED_DATA(ged);
int off;
@@ -246,7 +246,8 @@ apei_pcie_handler(ACPI_HEST_GENERIC_DATA *ged)
int h = 0, sev;
if ((p->ValidationBits & 0x8) == 0x8) {
- mtx_lock(&Giant);
+ if (!fatal)
+ mtx_lock(&Giant);
dev = pci_find_dbsf((uint32_t)p->DeviceID[10] << 8 |
p->DeviceID[9], p->DeviceID[11], p->DeviceID[8],
p->DeviceID[7]);
@@ -264,9 +265,11 @@ apei_pcie_handler(ACPI_HEST_GENERIC_DATA *ged)
}
pcie_apei_error(dev, sev,
(p->ValidationBits & 0x80) ? p->AERInfo : NULL);
- h = 1;
+ if (!fatal)
+ h = 1;
}
- mtx_unlock(&Giant);
+ if (!fatal)
+ mtx_unlock(&Giant);
}
if (h)
return (h);
@@ -322,8 +325,8 @@ apei_pcie_handler(ACPI_HEST_GENERIC_DATA *ged)
return (0);
}
-static void
-apei_ged_handler(ACPI_HEST_GENERIC_DATA *ged)
+static const char *
+apei_ged_handler(ACPI_HEST_GENERIC_DATA *ged, bool fatal)
{
ACPI_HEST_GENERIC_DATA_V300 *ged3 = (ACPI_HEST_GENERIC_DATA_V300 *)ged;
/* A5BC1114-6F64-4EDE-B863-3E83ED7C83B1 */
@@ -342,12 +345,12 @@ apei_ged_handler(ACPI_HEST_GENERIC_DATA *ged)
if (memcmp(mem_uuid, ged->SectionType, ACPI_UUID_LENGTH) == 0) {
h = apei_mem_handler(ged);
} else if (memcmp(pcie_uuid, ged->SectionType, ACPI_UUID_LENGTH) == 0) {
- h = apei_pcie_handler(ged);
+ h = apei_pcie_handler(ged, fatal);
} else {
if (!log_corrected &&
(ged->ErrorSeverity == ACPI_HEST_GEN_ERROR_CORRECTED ||
ged->ErrorSeverity == ACPI_HEST_GEN_ERROR_NONE))
- return;
+ return (NULL);
t = ged->SectionType;
printf("APEI %s Error %02x%02x%02x%02x-%02x%02x-"
@@ -364,7 +367,7 @@ apei_ged_handler(ACPI_HEST_GENERIC_DATA *ged)
}
}
if (h)
- return;
+ return (NULL);
printf(" Flags: 0x%x\n", ged->Flags);
if (ged->ValidationBits & ACPI_HEST_GEN_VALID_FRU_ID) {
@@ -379,6 +382,19 @@ apei_ged_handler(ACPI_HEST_GENERIC_DATA *ged)
if (ged->Revision >= 0x300 &&
ged->ValidationBits & ACPI_HEST_GEN_VALID_TIMESTAMP)
printf(" Timestamp: %016jx\n", ged3->TimeStamp);
+ if (fatal) {
+ printf(" Error Data:\n");
+ t = (uint8_t *)GED_DATA(ged);
+ for (off = 0; off < ged->ErrorDataLength; off++) {
+ printf(" %02x", t[off]);
+ if ((off % 16) == 15 ||
+ off + 1 == ged->ErrorDataLength)
+ printf("\n");
+ }
+ }
+ if (ged->ValidationBits & ACPI_HEST_GEN_VALID_FRU_STRING)
+ return ((const char *)ged->FruText);
+ return (NULL);
}
static int
@@ -387,23 +403,27 @@ apei_ge_handler(struct apei_ge *ge, bool copy)
uint8_t *buf = copy ? ge->copybuf : ge->buf;
ACPI_HEST_GENERIC_STATUS *ges = (ACPI_HEST_GENERIC_STATUS *)buf;
ACPI_HEST_GENERIC_DATA *ged;
+ const char *fru, *f;
size_t off, len;
- uint32_t sev;
int i, c;
+ bool fatal;
if (ges == NULL || ges->BlockStatus == 0)
return (0);
c = (ges->BlockStatus >> 4) & 0x3ff;
- sev = ges->ErrorSeverity;
+ fatal = (ges->ErrorSeverity == ACPI_HEST_GEN_ERROR_FATAL);
/* Process error entries. */
+ fru = NULL;
len = MIN(ge->v1.ErrorBlockLength - sizeof(*ges), ges->DataLength);
for (off = i = 0; i < c && off + sizeof(*ged) <= len; i++) {
ged = (ACPI_HEST_GENERIC_DATA *)&buf[sizeof(*ges) + off];
if ((uint64_t)GED_SIZE(ged) + ged->ErrorDataLength > len - off)
break;
- apei_ged_handler(ged);
+ f = apei_ged_handler(ged, fatal);
+ if (f != NULL && fru == NULL)
+ fru = f;
off += GED_SIZE(ged) + ged->ErrorDataLength;
}
@@ -418,8 +438,9 @@ apei_ge_handler(struct apei_ge *ge, bool copy)
}
/* If ACPI told the error is fatal -- make it so. */
- if (sev == ACPI_HEST_GEN_ERROR_FATAL)
- panic("APEI Fatal Hardware Error!");
+ if (fatal)
+ panic("APEI Fatal Hardware Error: %.20s",
+ fru != NULL ? fru : "unknown");
return (1);
}
@@ -450,9 +471,9 @@ apei_nmi_handler(void)
if (ges == NULL || ges->BlockStatus == 0)
continue;
- /* If ACPI told the error is fatal -- make it so. */
+ /* Log and panic via apei_ge_handler(); does not return. */
if (ges->ErrorSeverity == ACPI_HEST_GEN_ERROR_FATAL)
- panic("APEI Fatal Hardware Error!");
+ apei_ge_handler(ge, false);
/* Copy the buffer for later processing. */
gesc = (ACPI_HEST_GENERIC_STATUS *)ge->copybuf;