svn commit: r364003 - in stable/12/sys: arm64/arm64 arm64/include conf dev/acpica dev/pci x86/include x86/x86

Alexander Motin mav at FreeBSD.org
Fri Aug 7 15:59:47 UTC 2020


I'm sorry, it was my fault.  Should be fixed by r364020.  But I'd still
look at the data just out of curiosity.

On 07.08.2020 09:12, Alexander Motin wrote:
> Do you mean you are getting two apeiX devices?  That is unexpected.
> Could you send me `devinfo -vr` and `acpidump -t` outputs?  And let's go
> private for debugging.
> 
> On 07.08.2020 04:58, Jack L. wrote:
>> I'm getting
>> apei1 platform error interface
>> after installing the kernel with this commit on multiple Dell R610 servers.
>>
>> On Thu, Aug 6, 2020 at 6:05 PM Alexander Motin <mav at freebsd.org> wrote:
>>>
>>> Author: mav
>>> Date: Fri Aug  7 01:05:10 2020
>>> New Revision: 364003
>>> URL: https://svnweb.freebsd.org/changeset/base/364003
>>>
>>> Log:
>>>   MFC r363624: Add initial driver for ACPI Platform Error Interfaces.
>>>
>>>   APEI allows the platform to report different kinds of errors to the OS in
>>>   several ways.  We've found that Supermicro X10/X11 motherboards report PCIe
>>>   errors appearing on hot-unplug via this interface using NMI.  Without a
>>>   respective driver it ended up in a kernel panic without any additional
>>>   information.
>>>
>>>   This driver introduces support for the APEI Generic Hardware Error Source
>>>   reporting via NMI, SCI or polling.  It decodes the reported errors and
>>>   either passes them to pci(4) for processing or just logs them otherwise.
>>>   Errors marked as fatal still end up in a kernel panic, but a somewhat more
>>>   informative one.
>>>
>>>   When somebody gets to a native PCIe AER support implementation, both of the
>>>   reporting mechanisms should get common error recovery code.  Since in our
>>>   case errors happen when the device is already gone, there is nothing to
>>>   recover, so the code just clears the error statuses, practically ignoring
>>>   the otherwise destructive NMIs in a nicer way.
>>>
>>>   Relnotes:     yes
>>>   Sponsored by: iXsystems, Inc.
>>>
>>> Added:
>>>   stable/12/sys/dev/acpica/acpi_apei.c
>>>      - copied, changed from r363624, head/sys/dev/acpica/acpi_apei.c
>>> Modified:
>>>   stable/12/sys/arm64/arm64/machdep.c
>>>   stable/12/sys/arm64/include/acpica_machdep.h
>>>   stable/12/sys/conf/files
>>>   stable/12/sys/dev/acpica/acpi.c
>>>   stable/12/sys/dev/pci/pci.c
>>>   stable/12/sys/dev/pci/pcivar.h
>>>   stable/12/sys/x86/include/acpica_machdep.h
>>>   stable/12/sys/x86/x86/cpu_machdep.c
>>> Directory Properties:
>>>   stable/12/   (props changed)
>>>
>>> Modified: stable/12/sys/arm64/arm64/machdep.c
>>> ==============================================================================
>>> --- stable/12/sys/arm64/arm64/machdep.c Fri Aug  7 00:56:20 2020        (r364002)
>>> +++ stable/12/sys/arm64/arm64/machdep.c Fri Aug  7 01:05:10 2020        (r364003)
>>> @@ -131,6 +131,8 @@ void pagezero_cache(void *);
>>>  /* pagezero_simple is default pagezero */
>>>  void (*pagezero)(void *p) = pagezero_simple;
>>>
>>> +int (*apei_nmi)(void);
>>> +
>>>  static void
>>>  pan_setup(void)
>>>  {
>>>
>>> Modified: stable/12/sys/arm64/include/acpica_machdep.h
>>> ==============================================================================
>>> --- stable/12/sys/arm64/include/acpica_machdep.h        Fri Aug  7 00:56:20 2020        (r364002)
>>> +++ stable/12/sys/arm64/include/acpica_machdep.h        Fri Aug  7 01:05:10 2020        (r364003)
>>> @@ -57,6 +57,8 @@ struct acpi_generic_address;
>>>  int    acpi_map_addr(struct acpi_generic_address  *, bus_space_tag_t *,
>>>      bus_space_handle_t *, bus_size_t);
>>>
>>> +extern int (*apei_nmi)(void);
>>> +
>>>  #endif /* _KERNEL */
>>>
>>>  #endif /* __ACPICA_MACHDEP_H__ */
>>>
>>> Modified: stable/12/sys/conf/files
>>> ==============================================================================
>>> --- stable/12/sys/conf/files    Fri Aug  7 00:56:20 2020        (r364002)
>>> +++ stable/12/sys/conf/files    Fri Aug  7 01:05:10 2020        (r364003)
>>> @@ -740,6 +740,7 @@ dev/acpica/Osd/OsdSynch.c   optional acpi
>>>  dev/acpica/Osd/OsdTable.c      optional acpi
>>>  dev/acpica/acpi.c              optional acpi
>>>  dev/acpica/acpi_acad.c         optional acpi
>>> +dev/acpica/acpi_apei.c         optional acpi
>>>  dev/acpica/acpi_battery.c      optional acpi
>>>  dev/acpica/acpi_button.c       optional acpi
>>>  dev/acpica/acpi_cmbat.c                optional acpi
>>>
>>> Modified: stable/12/sys/dev/acpica/acpi.c
>>> ==============================================================================
>>> --- stable/12/sys/dev/acpica/acpi.c     Fri Aug  7 00:56:20 2020        (r364002)
>>> +++ stable/12/sys/dev/acpica/acpi.c     Fri Aug  7 01:05:10 2020        (r364003)
>>> @@ -151,6 +151,7 @@ static ACPI_STATUS acpi_device_scan_children(device_t
>>>                     int max_depth, acpi_scan_cb_t user_fn, void *arg);
>>>  static int     acpi_isa_pnp_probe(device_t bus, device_t child,
>>>                     struct isa_pnp_id *ids);
>>> +static void    acpi_platform_osc(device_t dev);
>>>  static void    acpi_probe_children(device_t bus);
>>>  static void    acpi_probe_order(ACPI_HANDLE handle, int *order);
>>>  static ACPI_STATUS acpi_probe_child(ACPI_HANDLE handle, UINT32 level,
>>> @@ -673,6 +674,8 @@ acpi_attach(device_t dev)
>>>      /* Register ACPI again to pass the correct argument of pm_func. */
>>>      power_pm_register(POWER_PM_TYPE_ACPI, acpi_pm_func, sc);
>>>
>>> +    acpi_platform_osc(dev);
>>> +
>>>      if (!acpi_disabled("bus")) {
>>>         EVENTHANDLER_REGISTER(dev_lookup, acpi_lookup, NULL, 1000);
>>>         acpi_probe_children(dev);
>>> @@ -1919,6 +1922,34 @@ acpi_enable_pcie(void)
>>>                 alloc++;
>>>         }
>>>  #endif
>>> +}
>>> +
>>> +static void
>>> +acpi_platform_osc(device_t dev)
>>> +{
>>> +       ACPI_HANDLE sb_handle;
>>> +       ACPI_STATUS status;
>>> +       uint32_t cap_set[2];
>>> +
>>> +       /* 0811B06E-4A27-44F9-8D60-3CBBC22E7B48 */
>>> +       static uint8_t acpi_platform_uuid[ACPI_UUID_LENGTH] = {
>>> +               0x6e, 0xb0, 0x11, 0x08, 0x27, 0x4a, 0xf9, 0x44,
>>> +               0x8d, 0x60, 0x3c, 0xbb, 0xc2, 0x2e, 0x7b, 0x48
>>> +       };
>>> +
>>> +       if (ACPI_FAILURE(AcpiGetHandle(ACPI_ROOT_OBJECT, "\\_SB_", &sb_handle)))
>>> +               return;
>>> +
>>> +       cap_set[1] = 0x10;      /* APEI Support */
>>> +       status = acpi_EvaluateOSC(sb_handle, acpi_platform_uuid, 1,
>>> +           nitems(cap_set), cap_set, cap_set, false);
>>> +       if (ACPI_FAILURE(status)) {
>>> +               if (status == AE_NOT_FOUND)
>>> +                       return;
>>> +               device_printf(dev, "_OSC failed: %s\n",
>>> +                   AcpiFormatException(status));
>>> +               return;
>>> +       }
>>>  }
>>>
>>>  /*
>>>
>>> Copied and modified: stable/12/sys/dev/acpica/acpi_apei.c (from r363624, head/sys/dev/acpica/acpi_apei.c)
>>> ==============================================================================
>>> --- head/sys/dev/acpica/acpi_apei.c     Mon Jul 27 21:19:41 2020        (r363624, copy source)
>>> +++ stable/12/sys/dev/acpica/acpi_apei.c        Fri Aug  7 01:05:10 2020        (r364003)
>>> @@ -574,7 +574,7 @@ apei_probe(device_t dev)
>>>         if (acpi_find_table(ACPI_SIG_HEST) == 0)
>>>                 return (ENXIO);
>>>         if (acpi_get_handle(dev) != NULL)
>>> -               rv = ACPI_ID_PROBE(device_get_parent(dev), dev, apei_ids, NULL);
>>> +               rv = (ACPI_ID_PROBE(device_get_parent(dev), dev, apei_ids) != NULL);
>>>         else
>>>                 rv = 0;
>>>         if (rv <= 0)
>>>
>>> Modified: stable/12/sys/dev/pci/pci.c
>>> ==============================================================================
>>> --- stable/12/sys/dev/pci/pci.c Fri Aug  7 00:56:20 2020        (r364002)
>>> +++ stable/12/sys/dev/pci/pci.c Fri Aug  7 01:05:10 2020        (r364003)
>>> @@ -6284,6 +6284,67 @@ pcie_get_max_completion_timeout(device_t dev)
>>>         }
>>>  }
>>>
>>> +void
>>> +pcie_apei_error(device_t dev, int sev, uint8_t *aerp)
>>> +{
>>> +       struct pci_devinfo *dinfo = device_get_ivars(dev);
>>> +       const char *s;
>>> +       int aer;
>>> +       uint32_t r, r1;
>>> +       uint16_t rs;
>>> +
>>> +       if (sev == PCIEM_STA_CORRECTABLE_ERROR)
>>> +               s = "Correctable";
>>> +       else if (sev == PCIEM_STA_NON_FATAL_ERROR)
>>> +               s = "Uncorrectable (Non-Fatal)";
>>> +       else
>>> +               s = "Uncorrectable (Fatal)";
>>> +       device_printf(dev, "%s PCIe error reported by APEI\n", s);
>>> +       if (aerp) {
>>> +               if (sev == PCIEM_STA_CORRECTABLE_ERROR) {
>>> +                       r = le32dec(aerp + PCIR_AER_COR_STATUS);
>>> +                       r1 = le32dec(aerp + PCIR_AER_COR_MASK);
>>> +               } else {
>>> +                       r = le32dec(aerp + PCIR_AER_UC_STATUS);
>>> +                       r1 = le32dec(aerp + PCIR_AER_UC_MASK);
>>> +               }
>>> +               device_printf(dev, "status 0x%08x mask 0x%08x", r, r1);
>>> +               if (sev != PCIEM_STA_CORRECTABLE_ERROR) {
>>> +                       r = le32dec(aerp + PCIR_AER_UC_SEVERITY);
>>> +                       rs = le16dec(aerp + PCIR_AER_CAP_CONTROL);
>>> +                       printf(" severity 0x%08x first %d\n",
>>> +                           r, rs & 0x1f);
>>> +               } else
>>> +                       printf("\n");
>>> +       }
>>> +
>>> +       /* As kind of recovery just report and clear the error statuses. */
>>> +       if (pci_find_extcap(dev, PCIZ_AER, &aer) == 0) {
>>> +               r = pci_read_config(dev, aer + PCIR_AER_UC_STATUS, 4);
>>> +               if (r != 0) {
>>> +                       pci_write_config(dev, aer + PCIR_AER_UC_STATUS, r, 4);
>>> +                       device_printf(dev, "Clearing UC AER errors 0x%08x\n", r);
>>> +               }
>>> +
>>> +               r = pci_read_config(dev, aer + PCIR_AER_COR_STATUS, 4);
>>> +               if (r != 0) {
>>> +                       pci_write_config(dev, aer + PCIR_AER_COR_STATUS, r, 4);
>>> +                       device_printf(dev, "Clearing COR AER errors 0x%08x\n", r);
>>> +               }
>>> +       }
>>> +       if (dinfo->cfg.pcie.pcie_location != 0) {
>>> +               rs = pci_read_config(dev, dinfo->cfg.pcie.pcie_location +
>>> +                   PCIER_DEVICE_STA, 2);
>>> +               if ((rs & (PCIEM_STA_CORRECTABLE_ERROR |
>>> +                   PCIEM_STA_NON_FATAL_ERROR | PCIEM_STA_FATAL_ERROR |
>>> +                   PCIEM_STA_UNSUPPORTED_REQ)) != 0) {
>>> +                       pci_write_config(dev, dinfo->cfg.pcie.pcie_location +
>>> +                           PCIER_DEVICE_STA, rs, 2);
>>> +                       device_printf(dev, "Clearing PCIe errors 0x%04x\n", rs);
>>> +               }
>>> +       }
>>> +}
>>> +
>>>  /*
>>>   * Perform a Function Level Reset (FLR) on a device.
>>>   *
>>>
>>> Modified: stable/12/sys/dev/pci/pcivar.h
>>> ==============================================================================
>>> --- stable/12/sys/dev/pci/pcivar.h      Fri Aug  7 00:56:20 2020        (r364002)
>>> +++ stable/12/sys/dev/pci/pcivar.h      Fri Aug  7 01:05:10 2020        (r364003)
>>> @@ -679,6 +679,7 @@ uint32_t pcie_read_config(device_t dev, int reg, int w
>>>  void   pcie_write_config(device_t dev, int reg, uint32_t value, int width);
>>>  uint32_t pcie_adjust_config(device_t dev, int reg, uint32_t mask,
>>>             uint32_t value, int width);
>>> +void   pcie_apei_error(device_t dev, int sev, uint8_t *aer);
>>>  bool   pcie_flr(device_t dev, u_int max_delay, bool force);
>>>  int    pcie_get_max_completion_timeout(device_t dev);
>>>  bool   pcie_wait_for_pending_transactions(device_t dev, u_int max_delay);
>>>
>>> Modified: stable/12/sys/x86/include/acpica_machdep.h
>>> ==============================================================================
>>> --- stable/12/sys/x86/include/acpica_machdep.h  Fri Aug  7 00:56:20 2020        (r364002)
>>> +++ stable/12/sys/x86/include/acpica_machdep.h  Fri Aug  7 01:05:10 2020        (r364003)
>>> @@ -84,6 +84,7 @@ void  madt_parse_interrupt_values(void *entry,
>>>             enum intr_trigger *trig, enum intr_polarity *pol);
>>>
>>>  extern int madt_found_sci_override;
>>> +extern int (*apei_nmi)(void);
>>>
>>>  #endif /* _KERNEL */
>>>
>>>
>>> Modified: stable/12/sys/x86/x86/cpu_machdep.c
>>> ==============================================================================
>>> --- stable/12/sys/x86/x86/cpu_machdep.c Fri Aug  7 00:56:20 2020        (r364002)
>>> +++ stable/12/sys/x86/x86/cpu_machdep.c Fri Aug  7 01:05:10 2020        (r364003)
>>> @@ -811,6 +811,7 @@ int nmi_is_broadcast = 1;
>>>  SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
>>>      &nmi_is_broadcast, 0,
>>>      "Chipset NMI is broadcast");
>>> +int (*apei_nmi)(void);
>>>
>>>  void
>>>  nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
>>> @@ -825,6 +826,10 @@ nmi_call_kdb(u_int cpu, u_int type, struct trapframe *
>>>                         panic("NMI indicates hardware failure");
>>>         }
>>>  #endif /* DEV_ISA */
>>> +
>>> +       /* ACPI Platform Error Interfaces callback. */
>>> +       if (apei_nmi != NULL && (*apei_nmi)())
>>> +               claimed = true;
>>>
>>>         /*
>>>          * NMIs can be useful for debugging.  They can be hooked up to a
>>> _______________________________________________
>>> svn-src-all at freebsd.org mailing list
>>> https://lists.freebsd.org/mailman/listinfo/svn-src-all
>>> To unsubscribe, send any mail to "svn-src-all-unsubscribe at freebsd.org"
> 

-- 
Alexander Motin


More information about the svn-src-all mailing list