Panic with mca trap

Thu Feb 3 16:39:36 UTC 2011

On Thu, Feb 3, 2011 at 7:09 AM, John Baldwin <jhb at freebsd.org> wrote:
> On Thursday, February 03, 2011 8:05:31 am John Baldwin wrote:
>> On Tuesday, February 01, 2011 11:58:12 am mdf at freebsd.org wrote:
>> > On a piece of hardware trying to verify basic build tests, we got an
>> > MCA exception that then panic'd the kernel due to WITNESS/INVARIANTS
>> > interaction.
>> >
>> > panic @ time 1296563157.510, thread 0xffffff0005540000: blockable
>> > sleep lock (sleep mutex) 128 @ /build/mnt/src/sys/vm/uma_core.c:1872
>> >
>> > Stack: --------------------------------------------------
>> > kernel:witness_checkorder+0x7a2
>> > kernel:_mtx_lock_flags+0x81
>> > kernel:uma_zalloc_arg+0x256
>> > kernel:malloc+0xc5
>> > kernel:mca_record_entry+0x30
>> > kernel:mca_scan+0xc9
>> > kernel:mca_intr+0x79
>> > kernel:trap+0x30b
>> > kernel:witness_checkorder+0x66
>> > kernel:_mtx_lock_spin_flags+0xa4
>> > kernel:witness_checkorder+0x2a8
>> > kernel:_mtx_lock_spin_flags+0xa4
>> > kernel:tdq_idled+0xe8
>> > kernel:sched_idletd+0x5b
>> > kernel:fork_exit+0x9b
>> >
>> > That's this bit of code in uma_zalloc_arg():
>> >
>> > #ifdef INVARIANTS
>> >                         ZONE_LOCK(zone);
>> >                         uma_dbg_alloc(zone, NULL, item);
>> >                         ZONE_UNLOCK(zone);
>> > #endif
>> >
>> >
>> > I don't know uma(9) well enough to know the best workaround.  Clearly
>> > there are times we can be in uma_zalloc_arg() and taking a regular
>> > mutex is not acceptable.  But what to do for the uma_dbg_free() call
>> > so it's happy, and whether to guard taking the ZONE lock with M_NOWAIT
>> > or td_critnest > 0 or both is outside my current knowledge.
>> >
>> > I don't expect we'll see this panic again any time soon, but it would
>> > be nice to fix the story for WITNESS of when an M_NOWAIT allocation
>> > can be done.
>>
>> Actually, this is more my fault.  The machine check happened while the
>> interrupted thread was already in a critical section (hence the WITNESS
>> complaint).  However, it really isn't correct to be calling malloc() from an
>> arbitrary exception handler, especially one like MC# which can fire pretty
>> much anywhere.  I think instead that we should use malloc() when polling the
>> machine check banks, but keep a pre-allocated pool of structures for use with
>> MC# exceptions and CMC interrupts and replenish the pool via an asynchronous
>> task.
>
> This is what I'm thinking:
>
> Index: mca.c
> ===================================================================
> --- mca.c       (revision 218221)
> +++ mca.c       (working copy)
> @@ -85,6 +85,7 @@
>  static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");
>
>  static int mca_count;          /* Number of records stored. */
> +static int mca_banks;          /* Number of per-CPU register banks. */
>
>  SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL, "Machine Check Architecture");
>
> @@ -102,6 +103,8 @@
>  SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erratum383, 0,
>     "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");
>
> +static STAILQ_HEAD(, mca_internal) mca_freelist;
> +static int mca_freecount;
>  static STAILQ_HEAD(, mca_internal) mca_records;
>  static struct callout mca_timer;
>  static int mca_ticks = 3600;   /* Check hourly by default. */
> @@ -111,7 +114,6 @@
>
>  #ifdef DEV_APIC
>  static struct cmc_state **cmc_state;   /* Indexed by cpuid, bank */
> -static int cmc_banks;
>  static int cmc_throttle = 60;  /* Time in seconds to throttle CMCI. */
>  #endif
>
> @@ -415,21 +417,51 @@
>        return (1);
>  }
>
> -static void __nonnull(1)
> -mca_record_entry(const struct mca_record *record)
> +static void
> +mca_fill_freelist(void)
>  {
>        struct mca_internal *rec;
> +       int desired;
>
> -       rec = malloc(sizeof(*rec), M_MCA, M_NOWAIT);
> -       if (rec == NULL) {
> -               printf("MCA: Unable to allocate space for an event.\n");
> -               mca_log(record);
> -               return;
> +       /*
> +        * Ensure we have at least one record for each bank and one
> +        * record per CPU.
> +        */
> +       desired = imax(mp_ncpus, mca_banks);
> +       mtx_lock_spin(&mca_lock);
> +       while (desired < mca_freecount) {
> +               mtx_unlock_spin(&mca_lock);
> +               rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
> +               mtx_lock_spin(&mca_lock);
> +               STAILQ_INSERT_TAIL(&mca_freelist, rec, link);
> +               mca_freecount++;
>        }
> +       mtx_unlock_spin(&mca_lock);
> +}
>
> +static void __nonnull(2)
> +mca_record_entry(enum scan_mode mode, const struct mca_record *record)
> +{
> +       struct mca_internal *rec;
> +
> +       if (mode == POLLED) {
> +               rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
> +               mtx_lock_spin(&mca_lock);
> +       } else {
> +               mtx_lock_spin(&mca_lock);
> +               rec = STAILQ_FIRST(&mca_freelist);
> +               if (rec == NULL) {
> +                       mtx_unlock_spin(&mca_lock);
> +                       printf("MCA: Unable to allocate space for an event.\n");
> +                       mca_log(record);
> +                       return;
> +               }
> +               STAILQ_REMOVE_HEAD(&mca_freelist, link);
> +               mca_freecount--;
> +       }
> +
>        rec->rec = *record;
>        rec->logged = 0;
> -       mtx_lock_spin(&mca_lock);
>        STAILQ_INSERT_TAIL(&mca_records, rec, link);
>        mca_count++;
>        mtx_unlock_spin(&mca_lock);
> @@ -551,7 +583,7 @@
>                                recoverable = 0;
>                                mca_log(&rec);
>                        }
> -                       mca_record_entry(&rec);
> +                       mca_record_entry(mode, &rec);
>                }
>
>  #ifdef DEV_APIC
> @@ -563,6 +595,8 @@
>                        cmci_update(mode, i, valid, &rec);
>  #endif
>        }
> +       if (mode == POLLED)
> +               mca_fill_freelist();
>        return (mode == MCE ? recoverable : count);
>  }
>
> @@ -642,15 +676,14 @@
>
>  #ifdef DEV_APIC
>  static void
> -cmci_setup(uint64_t mcg_cap)
> +cmci_setup(void)
>  {
>        int i;
>
>        cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state **),
>            M_MCA, M_WAITOK);
> -       cmc_banks = mcg_cap & MCG_CAP_COUNT;
>        for (i = 0; i <= mp_maxid; i++)
> -               cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks,
> +               cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks,
>                    M_MCA, M_WAITOK | M_ZERO);
>        SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
>            "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
> @@ -672,10 +705,13 @@
>            CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
>                workaround_erratum383 = 1;
>
> +       mca_banks = mcg_cap & MCG_CAP_COUNT;
>        mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
>        STAILQ_INIT(&mca_records);
>        TASK_INIT(&mca_task, 0, mca_scan_cpus, NULL);
>        callout_init(&mca_timer, CALLOUT_MPSAFE);
> +       STAILQ_INIT(&mca_freelist);
> +       mca_fill_freelist();
>        SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
>            "count", CTLFLAG_RD, &mca_count, 0, "Record count");
>        SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
> @@ -689,7 +725,7 @@
>            sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
>  #ifdef DEV_APIC
>        if (mcg_cap & MCG_CAP_CMCI_P)
> -               cmci_setup(mcg_cap);
> +               cmci_setup();
>  #endif
>  }
>
> @@ -707,7 +743,7 @@
>        struct cmc_state *cc;
>        uint64_t ctl;
>
> -       KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
> +       KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
>
>        ctl = rdmsr(MSR_MC_CTL2(i));
>        if (ctl & MC_CTL2_CMCI_EN)
> @@ -751,7 +787,7 @@
>        struct cmc_state *cc;
>        uint64_t ctl;
>
> -       KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
> +       KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
>
>        /* Ignore banks not monitored by this CPU. */
>        if (!(PCPU_GET(cmci_mask) & 1 << i))

This generally looks right to me.  I can't test it locally since we're
trying to get a release out the door, and I think we've already
replaced the bad CPU on the machine that failed.

Thanks,
matthew